mirror of
https://github.com/openssl/openssl.git
synced 2024-12-03 05:41:46 +08:00
1aa89a7a3a
They now generally conform to the following argument sequence: script.pl "$(PERLASM_SCHEME)" [ C preprocessor arguments ... ] \ $(PROCESSOR) <output file> However, in the spirit of being able to use these scripts manually, they also allow for no argument, or for only the flavour, or for only the output file. This is done by only using the last argument as output file if it's a file (it has an extension), and only using the first argument as flavour if it isn't a file (it doesn't have an extension). While we're at it, we make all $xlate calls the same, i.e. the $output argument is always quoted, and we always die on error when trying to start $xlate. There's a perl lesson in this, regarding operator priority... This will always succeed, even when it fails: open FOO, "something" || die "ERR: $!"; The reason is that '||' has higher priority than list operators (a function is essentially a list operator and gobbles up everything following it that isn't lower priority), and since a non-empty string is always true, that ends up being exactly the same as: open FOO, "something"; This, however, will fail if "something" can't be opened: open FOO, "something" or die "ERR: $!"; The reason is that 'or' has lower priority than list operators, i.e. it's performed after the 'open' call. Reviewed-by: Matt Caswell <matt@openssl.org> (Merged from https://github.com/openssl/openssl/pull/9884)
2266 lines
48 KiB
Raku
2266 lines
48 KiB
Raku
#! /usr/bin/env perl
|
|
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License 2.0 (the "License"). You may not use
|
|
# this file except in compliance with the License. You can obtain a copy
|
|
# in the file LICENSE in the source distribution or at
|
|
# https://www.openssl.org/source/license.html
|
|
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project.
|
|
#
|
|
# Rights for redistribution and usage in source and binary forms are
|
|
# granted according to the License. Warranty of any kind is disclaimed.
|
|
# ====================================================================
|
|
|
|
|
|
# July 1999
|
|
#
|
|
# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
|
|
#
|
|
# The module is designed to work with either of the "new" MIPS ABI(5),
|
|
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
|
|
# IRIX 5.x not only because it doesn't support new ABIs but also
|
|
# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
|
|
# 64-bit instructions (daddu, dmultu, etc.) found below will only
|
|
# cause illegal instruction exception:-(
|
|
#
|
|
# In addition the code depends on preprocessor flags set up by MIPSpro
|
|
# compiler driver (either as or cc) and therefore (probably?) can't be
|
|
# compiled by the GNU assembler. GNU C driver manages fine though...
|
|
# I mean as long as -mmips-as is specified or is the default option,
|
|
# because then it simply invokes /usr/bin/as which in turn takes
|
|
# perfect care of the preprocessor definitions. Another neat feature
|
|
# offered by the MIPSpro assembler is an optimization pass. This gave
|
|
# me the opportunity to have the code looking more regular as all those
|
|
# architecture dependent instruction rescheduling details were left to
|
|
# the assembler. Cool, huh?
|
|
#
|
|
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
|
|
# goes way over 3 times faster!
|
|
#
|
|
# <appro@openssl.org>
|
|
|
|
# October 2010
|
|
#
|
|
# Adapt the module even for 32-bit ABIs and other OSes. The former was
|
|
# achieved by mechanical replacement of 64-bit arithmetic instructions
|
|
# such as dmultu, daddu, etc. with their 32-bit counterparts and
|
|
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
|
|
# >3x performance improvement naturally does not apply to 32-bit code
|
|
# [because there is no instruction 32-bit compiler can't use], one
|
|
# has to be content with 40-85% improvement depending on benchmark and
|
|
# key length, more for longer keys.
|
|
|
|
# Command-line handling; every argument is optional.
#   $output  - the last argument, taken only if it looks like a file name
#              (it ends in ".<extension>"); otherwise left undefined.
#   $flavour - the first remaining argument, taken only if it does not look
#              like a file name (it contains no dot); defaults to "o32".
# $output is extracted first so that a lone file-name argument is never
# mistaken for a flavour.
$output  = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV)   : undef;
$flavour = (@ARGV && $ARGV[0]  !~ m|\.|)     ? shift(@ARGV) : "o32";
|
|
|
|
# Pick the instruction mnemonics and sizes substituted into the assembly
# templates below.  Flavours matching /64|n32/ get the 64-bit ("d"-prefixed)
# arithmetic plus ld/sd; every other flavour gets the 32-bit forms.
#   $BNSZ   - scales the BN_ULONG array offsets used in the templates
#   $SZREG  - scales the stack save-slot offsets used in prologues/epilogues
#   $PTR_*  - add/subtract used on pointers and on the stack pointer
#   $REG_*  - store/load used to save and restore registers on the stack
if ($flavour =~ /64|n32/i) {
    $LD="ld";
    $ST="sd";
    $MULTU="dmultu";
    $DIVU="ddivu";
    $ADDU="daddu";
    $SUBU="dsubu";
    $SRL="dsrl";
    $SLL="dsll";
    $BNSZ=8;
    $PTR_ADD="daddu";
    $PTR_SUB="dsubu";
    $SZREG=8;
    $REG_S="sd";
    $REG_L="ld";
} else {
    $LD="lw";
    $ST="sw";
    $MULTU="multu";
    $DIVU="divu";
    $ADDU="addu";
    $SUBU="subu";
    $SRL="srl";
    $SLL="sll";
    $BNSZ=4;
    $PTR_ADD="addu";
    $PTR_SUB="subu";
    $SZREG=4;
    $REG_S="sw";
    $REG_L="lw";
    # Only this branch seeds $code: the output starts with ".set mips2"
    # unless the preprocessor reports __mips_isa_rev >= 6.
    $code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
}
|
|
|
|
# Redirect STDOUT to the requested output file, if one was given.  The
# three-argument form keeps the file name from being parsed for a mode, and
# a failed open now aborts instead of silently leaving the generated
# assembly going to the original STDOUT.
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
|
|
|
|
# Below is N32/64 register layout used in the original module.  Each
# variable holds the textual register operand ("\$<number>") that gets
# interpolated into the assembly templates.
#
($zero,$at,$v0,$v1)                     = map { "\$$_" } (0..3);
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)       = map { "\$$_" } (4..11);
($t0,$t1,$t2,$t3,$t8,$t9)               = map { "\$$_" } (12..15,24,25);
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)       = map { "\$$_" } (16..23);
($gp,$sp,$fp,$ra)                       = map { "\$$_" } (28..31);
# $ta0..$ta3 are additional names for $a4..$a7.
($ta0,$ta1,$ta2,$ta3)                   = ($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).

# For NUBI, $gp is renamed to $v1 so that the "$gp" slot written and read
# by the NUBI prologue/epilogue templates actually preserves $v1.
$gp=$v1 if ($flavour =~ /nubi/i);

# $v1 doubles as the register that the templates load with the constant -4
# ("li $minus4,-4") and then use as an AND mask on the word count.
$minus4=$v1;
|
|
|
|
$code.=<<___;
|
|
#include "mips_arch.h"
|
|
|
|
#if defined(_MIPS_ARCH_MIPS64R6)
|
|
# define ddivu(rs,rt)
|
|
# define mfqt(rd,rs,rt) ddivu rd,rs,rt
|
|
# define mfrm(rd,rs,rt) dmodu rd,rs,rt
|
|
#elif defined(_MIPS_ARCH_MIPS32R6)
|
|
# define divu(rs,rt)
|
|
# define mfqt(rd,rs,rt) divu rd,rs,rt
|
|
# define mfrm(rd,rs,rt) modu rd,rs,rt
|
|
#else
|
|
# define $DIVU(rs,rt) $DIVU $zero,rs,rt
|
|
# define mfqt(rd,rs,rt) mflo rd
|
|
# define mfrm(rd,rs,rt) mfhi rd
|
|
#endif
|
|
|
|
.rdata
|
|
.asciiz "mips3.s, Version 1.2"
|
|
.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
|
|
|
|
.text
|
|
.set noat
|
|
|
|
.align 5
|
|
.globl bn_mul_add_words
|
|
.ent bn_mul_add_words
|
|
bn_mul_add_words:
|
|
.set noreorder
|
|
bgtz $a2,bn_mul_add_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_add_words
|
|
|
|
.align 5
|
|
.ent bn_mul_add_words_internal
|
|
bn_mul_add_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $ta0,$a2,$minus4
|
|
beqz $ta0,.L_bn_mul_add_words_tail
|
|
|
|
.L_bn_mul_add_words_loop:
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,0($a0)
|
|
$LD $t2,$BNSZ($a1)
|
|
$LD $t3,$BNSZ($a0)
|
|
$LD $ta0,2*$BNSZ($a1)
|
|
$LD $ta1,2*$BNSZ($a0)
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit
|
|
# values", but it seems to work fine
|
|
# even on 64-bit registers.
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
$MULTU ($t2,$a3)
|
|
sltu $at,$t1,$at
|
|
$ST $t1,0($a0)
|
|
$ADDU $v0,$at
|
|
|
|
$LD $ta2,3*$BNSZ($a1)
|
|
$LD $ta3,3*$BNSZ($a0)
|
|
$ADDU $t3,$v0
|
|
sltu $v0,$t3,$v0
|
|
mflo ($at,$t2,$a3)
|
|
mfhi ($t2,$t2,$a3)
|
|
$ADDU $t3,$at
|
|
$ADDU $v0,$t2
|
|
$MULTU ($ta0,$a3)
|
|
sltu $at,$t3,$at
|
|
$ST $t3,$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
|
|
subu $a2,4
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
$ADDU $ta1,$v0
|
|
sltu $v0,$ta1,$v0
|
|
mflo ($at,$ta0,$a3)
|
|
mfhi ($ta0,$ta0,$a3)
|
|
$ADDU $ta1,$at
|
|
$ADDU $v0,$ta0
|
|
$MULTU ($ta2,$a3)
|
|
sltu $at,$ta1,$at
|
|
$ST $ta1,-2*$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
|
|
|
|
and $ta0,$a2,$minus4
|
|
$ADDU $ta3,$v0
|
|
sltu $v0,$ta3,$v0
|
|
mflo ($at,$ta2,$a3)
|
|
mfhi ($ta2,$ta2,$a3)
|
|
$ADDU $ta3,$at
|
|
$ADDU $v0,$ta2
|
|
sltu $at,$ta3,$at
|
|
$ST $ta3,-$BNSZ($a0)
|
|
.set noreorder
|
|
bgtz $ta0,.L_bn_mul_add_words_loop
|
|
$ADDU $v0,$at
|
|
|
|
beqz $a2,.L_bn_mul_add_words_return
|
|
nop
|
|
|
|
.L_bn_mul_add_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,0($a0)
|
|
subu $a2,1
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
sltu $at,$t1,$at
|
|
$ST $t1,0($a0)
|
|
$ADDU $v0,$at
|
|
beqz $a2,.L_bn_mul_add_words_return
|
|
|
|
$LD $t0,$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,$BNSZ($a0)
|
|
subu $a2,1
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
sltu $at,$t1,$at
|
|
$ST $t1,$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
beqz $a2,.L_bn_mul_add_words_return
|
|
|
|
$LD $t0,2*$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t1,2*$BNSZ($a0)
|
|
$ADDU $t1,$v0
|
|
sltu $v0,$t1,$v0
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $t1,$at
|
|
$ADDU $v0,$t0
|
|
sltu $at,$t1,$at
|
|
$ST $t1,2*$BNSZ($a0)
|
|
$ADDU $v0,$at
|
|
|
|
.L_bn_mul_add_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_add_words_internal
|
|
|
|
.align 5
|
|
.globl bn_mul_words
|
|
.ent bn_mul_words
|
|
bn_mul_words:
|
|
.set noreorder
|
|
bgtz $a2,bn_mul_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_words
|
|
|
|
.align 5
|
|
.ent bn_mul_words_internal
|
|
bn_mul_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $ta0,$a2,$minus4
|
|
beqz $ta0,.L_bn_mul_words_tail
|
|
|
|
.L_bn_mul_words_loop:
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
$LD $t2,$BNSZ($a1)
|
|
$LD $ta0,2*$BNSZ($a1)
|
|
$LD $ta2,3*$BNSZ($a1)
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$MULTU ($t2,$a3)
|
|
$ST $v0,0($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
|
|
subu $a2,4
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
mflo ($at,$t2,$a3)
|
|
mfhi ($t2,$t2,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t3,$v0,$at
|
|
$MULTU ($ta0,$a3)
|
|
$ST $v0,-3*$BNSZ($a0)
|
|
$ADDU $v0,$t3,$t2
|
|
|
|
mflo ($at,$ta0,$a3)
|
|
mfhi ($ta0,$ta0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $ta1,$v0,$at
|
|
$MULTU ($ta2,$a3)
|
|
$ST $v0,-2*$BNSZ($a0)
|
|
$ADDU $v0,$ta1,$ta0
|
|
|
|
and $ta0,$a2,$minus4
|
|
mflo ($at,$ta2,$a3)
|
|
mfhi ($ta2,$ta2,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $ta3,$v0,$at
|
|
$ST $v0,-$BNSZ($a0)
|
|
.set noreorder
|
|
bgtz $ta0,.L_bn_mul_words_loop
|
|
$ADDU $v0,$ta3,$ta2
|
|
|
|
beqz $a2,.L_bn_mul_words_return
|
|
nop
|
|
|
|
.L_bn_mul_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$a3)
|
|
subu $a2,1
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$ST $v0,0($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
beqz $a2,.L_bn_mul_words_return
|
|
|
|
$LD $t0,$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
subu $a2,1
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$ST $v0,$BNSZ($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
beqz $a2,.L_bn_mul_words_return
|
|
|
|
$LD $t0,2*$BNSZ($a1)
|
|
$MULTU ($t0,$a3)
|
|
mflo ($at,$t0,$a3)
|
|
mfhi ($t0,$t0,$a3)
|
|
$ADDU $v0,$at
|
|
sltu $t1,$v0,$at
|
|
$ST $v0,2*$BNSZ($a0)
|
|
$ADDU $v0,$t1,$t0
|
|
|
|
.L_bn_mul_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_mul_words_internal
|
|
|
|
.align 5
|
|
.globl bn_sqr_words
|
|
.ent bn_sqr_words
|
|
bn_sqr_words:
|
|
.set noreorder
|
|
bgtz $a2,bn_sqr_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_sqr_words
|
|
|
|
.align 5
|
|
.ent bn_sqr_words_internal
|
|
bn_sqr_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $ta0,$a2,$minus4
|
|
beqz $ta0,.L_bn_sqr_words_tail
|
|
|
|
.L_bn_sqr_words_loop:
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$t0)
|
|
$LD $t2,$BNSZ($a1)
|
|
$LD $ta0,2*$BNSZ($a1)
|
|
$LD $ta2,3*$BNSZ($a1)
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,0($a0)
|
|
$ST $t0,$BNSZ($a0)
|
|
|
|
$MULTU ($t2,$t2)
|
|
subu $a2,4
|
|
$PTR_ADD $a0,8*$BNSZ
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
mflo ($t3,$t2,$t2)
|
|
mfhi ($t2,$t2,$t2)
|
|
$ST $t3,-6*$BNSZ($a0)
|
|
$ST $t2,-5*$BNSZ($a0)
|
|
|
|
$MULTU ($ta0,$ta0)
|
|
mflo ($ta1,$ta0,$ta0)
|
|
mfhi ($ta0,$ta0,$ta0)
|
|
$ST $ta1,-4*$BNSZ($a0)
|
|
$ST $ta0,-3*$BNSZ($a0)
|
|
|
|
|
|
$MULTU ($ta2,$ta2)
|
|
and $ta0,$a2,$minus4
|
|
mflo ($ta3,$ta2,$ta2)
|
|
mfhi ($ta2,$ta2,$ta2)
|
|
$ST $ta3,-2*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
bgtz $ta0,.L_bn_sqr_words_loop
|
|
$ST $ta2,-$BNSZ($a0)
|
|
|
|
beqz $a2,.L_bn_sqr_words_return
|
|
nop
|
|
|
|
.L_bn_sqr_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$MULTU ($t0,$t0)
|
|
subu $a2,1
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,0($a0)
|
|
$ST $t0,$BNSZ($a0)
|
|
beqz $a2,.L_bn_sqr_words_return
|
|
|
|
$LD $t0,$BNSZ($a1)
|
|
$MULTU ($t0,$t0)
|
|
subu $a2,1
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,2*$BNSZ($a0)
|
|
$ST $t0,3*$BNSZ($a0)
|
|
beqz $a2,.L_bn_sqr_words_return
|
|
|
|
$LD $t0,2*$BNSZ($a1)
|
|
$MULTU ($t0,$t0)
|
|
mflo ($t1,$t0,$t0)
|
|
mfhi ($t0,$t0,$t0)
|
|
$ST $t1,4*$BNSZ($a0)
|
|
$ST $t0,5*$BNSZ($a0)
|
|
|
|
.L_bn_sqr_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
|
|
.end bn_sqr_words_internal
|
|
|
|
.align 5
|
|
.globl bn_add_words
|
|
.ent bn_add_words
|
|
bn_add_words:
|
|
.set noreorder
|
|
bgtz $a3,bn_add_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_add_words
|
|
|
|
.align 5
|
|
.ent bn_add_words_internal
|
|
bn_add_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $at,$a3,$minus4
|
|
beqz $at,.L_bn_add_words_tail
|
|
|
|
.L_bn_add_words_loop:
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
subu $a3,4
|
|
$LD $t1,$BNSZ($a1)
|
|
and $at,$a3,$minus4
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$PTR_ADD $a2,4*$BNSZ
|
|
$LD $t3,3*$BNSZ($a1)
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$LD $ta1,-3*$BNSZ($a2)
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
$LD $ta2,-2*$BNSZ($a2)
|
|
$LD $ta3,-$BNSZ($a2)
|
|
$ADDU $ta0,$t0
|
|
sltu $t8,$ta0,$t0
|
|
$ADDU $t0,$ta0,$v0
|
|
sltu $v0,$t0,$ta0
|
|
$ST $t0,-4*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
$ADDU $ta1,$t1
|
|
sltu $t9,$ta1,$t1
|
|
$ADDU $t1,$ta1,$v0
|
|
sltu $v0,$t1,$ta1
|
|
$ST $t1,-3*$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
|
|
$ADDU $ta2,$t2
|
|
sltu $t8,$ta2,$t2
|
|
$ADDU $t2,$ta2,$v0
|
|
sltu $v0,$t2,$ta2
|
|
$ST $t2,-2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
$ADDU $ta3,$t3
|
|
sltu $t9,$ta3,$t3
|
|
$ADDU $t3,$ta3,$v0
|
|
sltu $v0,$t3,$ta3
|
|
$ST $t3,-$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
bgtz $at,.L_bn_add_words_loop
|
|
$ADDU $v0,$t9
|
|
|
|
beqz $a3,.L_bn_add_words_return
|
|
nop
|
|
|
|
.L_bn_add_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
$ADDU $ta0,$t0
|
|
subu $a3,1
|
|
sltu $t8,$ta0,$t0
|
|
$ADDU $t0,$ta0,$v0
|
|
sltu $v0,$t0,$ta0
|
|
$ST $t0,0($a0)
|
|
$ADDU $v0,$t8
|
|
beqz $a3,.L_bn_add_words_return
|
|
|
|
$LD $t1,$BNSZ($a1)
|
|
$LD $ta1,$BNSZ($a2)
|
|
$ADDU $ta1,$t1
|
|
subu $a3,1
|
|
sltu $t9,$ta1,$t1
|
|
$ADDU $t1,$ta1,$v0
|
|
sltu $v0,$t1,$ta1
|
|
$ST $t1,$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
beqz $a3,.L_bn_add_words_return
|
|
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$LD $ta2,2*$BNSZ($a2)
|
|
$ADDU $ta2,$t2
|
|
sltu $t8,$ta2,$t2
|
|
$ADDU $t2,$ta2,$v0
|
|
sltu $v0,$t2,$ta2
|
|
$ST $t2,2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
.L_bn_add_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
|
|
.end bn_add_words_internal
|
|
|
|
.align 5
|
|
.globl bn_sub_words
|
|
.ent bn_sub_words
|
|
bn_sub_words:
|
|
.set noreorder
|
|
bgtz $a3,bn_sub_words_internal
|
|
move $v0,$zero
|
|
jr $ra
|
|
move $a0,$zero
|
|
.end bn_sub_words
|
|
|
|
.align 5
|
|
.ent bn_sub_words_internal
|
|
bn_sub_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
li $minus4,-4
|
|
and $at,$a3,$minus4
|
|
beqz $at,.L_bn_sub_words_tail
|
|
|
|
.L_bn_sub_words_loop:
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
subu $a3,4
|
|
$LD $t1,$BNSZ($a1)
|
|
and $at,$a3,$minus4
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$PTR_ADD $a2,4*$BNSZ
|
|
$LD $t3,3*$BNSZ($a1)
|
|
$PTR_ADD $a0,4*$BNSZ
|
|
$LD $ta1,-3*$BNSZ($a2)
|
|
$PTR_ADD $a1,4*$BNSZ
|
|
$LD $ta2,-2*$BNSZ($a2)
|
|
$LD $ta3,-$BNSZ($a2)
|
|
sltu $t8,$t0,$ta0
|
|
$SUBU $ta0,$t0,$ta0
|
|
$SUBU $t0,$ta0,$v0
|
|
sgtu $v0,$t0,$ta0
|
|
$ST $t0,-4*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
sltu $t9,$t1,$ta1
|
|
$SUBU $ta1,$t1,$ta1
|
|
$SUBU $t1,$ta1,$v0
|
|
sgtu $v0,$t1,$ta1
|
|
$ST $t1,-3*$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
|
|
|
|
sltu $t8,$t2,$ta2
|
|
$SUBU $ta2,$t2,$ta2
|
|
$SUBU $t2,$ta2,$v0
|
|
sgtu $v0,$t2,$ta2
|
|
$ST $t2,-2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
sltu $t9,$t3,$ta3
|
|
$SUBU $ta3,$t3,$ta3
|
|
$SUBU $t3,$ta3,$v0
|
|
sgtu $v0,$t3,$ta3
|
|
$ST $t3,-$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
bgtz $at,.L_bn_sub_words_loop
|
|
$ADDU $v0,$t9
|
|
|
|
beqz $a3,.L_bn_sub_words_return
|
|
nop
|
|
|
|
.L_bn_sub_words_tail:
|
|
.set reorder
|
|
$LD $t0,0($a1)
|
|
$LD $ta0,0($a2)
|
|
subu $a3,1
|
|
sltu $t8,$t0,$ta0
|
|
$SUBU $ta0,$t0,$ta0
|
|
$SUBU $t0,$ta0,$v0
|
|
sgtu $v0,$t0,$ta0
|
|
$ST $t0,0($a0)
|
|
$ADDU $v0,$t8
|
|
beqz $a3,.L_bn_sub_words_return
|
|
|
|
$LD $t1,$BNSZ($a1)
|
|
subu $a3,1
|
|
$LD $ta1,$BNSZ($a2)
|
|
sltu $t9,$t1,$ta1
|
|
$SUBU $ta1,$t1,$ta1
|
|
$SUBU $t1,$ta1,$v0
|
|
sgtu $v0,$t1,$ta1
|
|
$ST $t1,$BNSZ($a0)
|
|
$ADDU $v0,$t9
|
|
beqz $a3,.L_bn_sub_words_return
|
|
|
|
$LD $t2,2*$BNSZ($a1)
|
|
$LD $ta2,2*$BNSZ($a2)
|
|
sltu $t8,$t2,$ta2
|
|
$SUBU $ta2,$t2,$ta2
|
|
$SUBU $t2,$ta2,$v0
|
|
sgtu $v0,$t2,$ta2
|
|
$ST $t2,2*$BNSZ($a0)
|
|
$ADDU $v0,$t8
|
|
|
|
.L_bn_sub_words_return:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_sub_words_internal
|
|
|
|
#if 0
|
|
/*
|
|
* The bn_div_3_words entry point is re-used for constant-time interface.
|
|
* Implementation is retained as historical reference.
|
|
*/
|
|
.align 5
|
|
.globl bn_div_3_words
|
|
.ent bn_div_3_words
|
|
bn_div_3_words:
|
|
.set noreorder
|
|
move $a3,$a0 # we know that bn_div_words does not
|
|
# touch $a3, $ta2, $ta3 and preserves $a2
|
|
# so that we can save two arguments
|
|
# and return address in registers
|
|
# instead of stack:-)
|
|
|
|
$LD $a0,($a3)
|
|
move $ta2,$a1
|
|
bne $a0,$a2,bn_div_3_words_internal
|
|
$LD $a1,-$BNSZ($a3)
|
|
li $v0,-1
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_3_words
|
|
|
|
.align 5
|
|
.ent bn_div_3_words_internal
|
|
bn_div_3_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
.set reorder
|
|
move $ta3,$ra
|
|
bal bn_div_words_internal
|
|
move $ra,$ta3
|
|
$MULTU ($ta2,$v0)
|
|
$LD $t2,-2*$BNSZ($a3)
|
|
move $ta0,$zero
|
|
mfhi ($t1,$ta2,$v0)
|
|
mflo ($t0,$ta2,$v0)
|
|
sltu $t8,$t1,$a1
|
|
.L_bn_div_3_words_inner_loop:
|
|
bnez $t8,.L_bn_div_3_words_inner_loop_done
|
|
sgeu $at,$t2,$t0
|
|
seq $t9,$t1,$a1
|
|
and $at,$t9
|
|
sltu $t3,$t0,$ta2
|
|
$ADDU $a1,$a2
|
|
$SUBU $t1,$t3
|
|
$SUBU $t0,$ta2
|
|
sltu $t8,$t1,$a1
|
|
sltu $ta0,$a1,$a2
|
|
or $t8,$ta0
|
|
.set noreorder
|
|
beqz $at,.L_bn_div_3_words_inner_loop
|
|
$SUBU $v0,1
|
|
$ADDU $v0,1
|
|
.set reorder
|
|
.L_bn_div_3_words_inner_loop_done:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_3_words_internal
|
|
#endif
|
|
|
|
.align 5
|
|
.globl bn_div_words
|
|
.ent bn_div_words
|
|
bn_div_words:
|
|
.set noreorder
|
|
bnez $a2,bn_div_words_internal
|
|
li $v0,-1 # I would rather signal div-by-zero
|
|
# which can be done with 'break 7'
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_words
|
|
|
|
.align 5
|
|
.ent bn_div_words_internal
|
|
bn_div_words_internal:
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
move $v1,$zero
|
|
bltz $a2,.L_bn_div_words_body
|
|
move $t9,$v1
|
|
$SLL $a2,1
|
|
bgtz $a2,.-4
|
|
addu $t9,1
|
|
|
|
.set reorder
|
|
negu $t1,$t9
|
|
li $t2,-1
|
|
$SLL $t2,$t1
|
|
and $t2,$a0
|
|
$SRL $at,$a1,$t1
|
|
.set noreorder
|
|
beqz $t2,.+12
|
|
nop
|
|
break 6 # signal overflow
|
|
.set reorder
|
|
$SLL $a0,$t9
|
|
$SLL $a1,$t9
|
|
or $a0,$at
|
|
___
|
|
# Symbolic register names used only by the bn_div_words body that follows:
#   $QT - quotient estimate (preset to all-ones in the low half, then
#         replaced by the divide result unless the divide is skipped)
#   $HH - upper half of the running dividend word ($a0 >> 4*$BNSZ bits)
#   $DH - upper half of the divisor ($a2 >> 4*$BNSZ bits)
# $DH shares $v1, which the same routine later overwrites with the remainder.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
|
|
$code.=<<___;
|
|
.L_bn_div_words_body:
|
|
$SRL $DH,$a2,4*$BNSZ # bits
|
|
sgeu $at,$a0,$a2
|
|
.set noreorder
|
|
beqz $at,.+12
|
|
nop
|
|
$SUBU $a0,$a2
|
|
.set reorder
|
|
|
|
li $QT,-1
|
|
$SRL $HH,$a0,4*$BNSZ # bits
|
|
$SRL $QT,4*$BNSZ # q=0xffffffff
|
|
beq $DH,$HH,.L_bn_div_words_skip_div1
|
|
$DIVU ($a0,$DH)
|
|
mfqt ($QT,$a0,$DH)
|
|
.L_bn_div_words_skip_div1:
|
|
$MULTU ($a2,$QT)
|
|
$SLL $t3,$a0,4*$BNSZ # bits
|
|
$SRL $at,$a1,4*$BNSZ # bits
|
|
or $t3,$at
|
|
mflo ($t0,$a2,$QT)
|
|
mfhi ($t1,$a2,$QT)
|
|
.L_bn_div_words_inner_loop1:
|
|
sltu $t2,$t3,$t0
|
|
seq $t8,$HH,$t1
|
|
sltu $at,$HH,$t1
|
|
and $t2,$t8
|
|
sltu $v0,$t0,$a2
|
|
or $at,$t2
|
|
.set noreorder
|
|
beqz $at,.L_bn_div_words_inner_loop1_done
|
|
$SUBU $t1,$v0
|
|
$SUBU $t0,$a2
|
|
b .L_bn_div_words_inner_loop1
|
|
$SUBU $QT,1
|
|
.set reorder
|
|
.L_bn_div_words_inner_loop1_done:
|
|
|
|
$SLL $a1,4*$BNSZ # bits
|
|
$SUBU $a0,$t3,$t0
|
|
$SLL $v0,$QT,4*$BNSZ # bits
|
|
|
|
li $QT,-1
|
|
$SRL $HH,$a0,4*$BNSZ # bits
|
|
$SRL $QT,4*$BNSZ # q=0xffffffff
|
|
beq $DH,$HH,.L_bn_div_words_skip_div2
|
|
$DIVU ($a0,$DH)
|
|
mfqt ($QT,$a0,$DH)
|
|
.L_bn_div_words_skip_div2:
|
|
$MULTU ($a2,$QT)
|
|
$SLL $t3,$a0,4*$BNSZ # bits
|
|
$SRL $at,$a1,4*$BNSZ # bits
|
|
or $t3,$at
|
|
mflo ($t0,$a2,$QT)
|
|
mfhi ($t1,$a2,$QT)
|
|
.L_bn_div_words_inner_loop2:
|
|
sltu $t2,$t3,$t0
|
|
seq $t8,$HH,$t1
|
|
sltu $at,$HH,$t1
|
|
and $t2,$t8
|
|
sltu $v1,$t0,$a2
|
|
or $at,$t2
|
|
.set noreorder
|
|
beqz $at,.L_bn_div_words_inner_loop2_done
|
|
$SUBU $t1,$v1
|
|
$SUBU $t0,$a2
|
|
b .L_bn_div_words_inner_loop2
|
|
$SUBU $QT,1
|
|
.set reorder
|
|
.L_bn_div_words_inner_loop2_done:
|
|
|
|
$SUBU $a0,$t3,$t0
|
|
or $v0,$QT
|
|
$SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it
|
|
$SRL $a2,$t9 # restore $a2
|
|
|
|
.set noreorder
|
|
move $a1,$v1
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
$code.=<<___;
|
|
jr $ra
|
|
move $a0,$v0
|
|
.end bn_div_words_internal
|
|
___
|
|
# The bn_div_words aliases are no longer needed; drop them so a stray use
# in a later template interpolates as empty rather than as a stale register.
undef $HH; undef $QT; undef $DH;

# Register assignment for the comba multiplication templates.
# $a_N / $b_N receive a[N] / b[N]; words 0..3 live in temporaries.
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

# Words 4..6 use $s0..$s5, which the bn_mul_comba8 prologue saves on the
# stack; word 7 reuses the incoming pointer register itself.
($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2

# $t_1/$t_2 receive the low/high halves of each product (mflo/mfhi);
# $c_1..$c_3 are the rotating column accumulators named c1..c3 in the
# mul_add_c() comments of the template.
($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
|
|
|
|
$code.=<<___;
|
|
|
|
.align 5
|
|
.globl bn_mul_comba8
|
|
.ent bn_mul_comba8
|
|
bn_mul_comba8:
|
|
.set noreorder
|
|
___
|
|
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,12*$SZREG,$ra
|
|
.mask 0x803ff008,-$SZREG
|
|
$PTR_SUB $sp,12*$SZREG
|
|
$REG_S $ra,11*$SZREG($sp)
|
|
$REG_S $s5,10*$SZREG($sp)
|
|
$REG_S $s4,9*$SZREG($sp)
|
|
$REG_S $s3,8*$SZREG($sp)
|
|
$REG_S $s2,7*$SZREG($sp)
|
|
$REG_S $s1,6*$SZREG($sp)
|
|
$REG_S $s0,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___ if ($flavour !~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x003f0000,-$SZREG
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $s5,5*$SZREG($sp)
|
|
$REG_S $s4,4*$SZREG($sp)
|
|
$REG_S $s3,3*$SZREG($sp)
|
|
$REG_S $s2,2*$SZREG($sp)
|
|
$REG_S $s1,1*$SZREG($sp)
|
|
$REG_S $s0,0*$SZREG($sp)
|
|
___
|
|
$code.=<<___;
|
|
|
|
.set reorder
|
|
$LD $a_0,0($a1) # If compiled with -mips3 option on
|
|
# R5000 box assembler barks on this
|
|
# 1ine with "should not have mult/div
|
|
# as last instruction in bb (R10K
|
|
# bug)" warning. If anybody out there
|
|
# has a clue about how to circumvent
|
|
# this do send me a note.
|
|
# <appro\@fy.chalmers.se>
|
|
|
|
$LD $b_0,0($a2)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
$LD $b_1,$BNSZ($a2)
|
|
$LD $b_2,2*$BNSZ($a2)
|
|
$LD $b_3,3*$BNSZ($a2)
|
|
mflo ($c_1,$a_0,$b_0)
|
|
mfhi ($c_2,$a_0,$b_0)
|
|
|
|
$LD $a_4,4*$BNSZ($a1)
|
|
$LD $a_5,5*$BNSZ($a1)
|
|
$MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
|
|
$LD $a_6,6*$BNSZ($a1)
|
|
$LD $a_7,7*$BNSZ($a1)
|
|
$LD $b_4,4*$BNSZ($a2)
|
|
$LD $b_5,5*$BNSZ($a2)
|
|
mflo ($t_1,$a_0,$b_1)
|
|
mfhi ($t_2,$a_0,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
|
|
$ADDU $c_3,$t_2,$at
|
|
$LD $b_6,6*$BNSZ($a2)
|
|
$LD $b_7,7*$BNSZ($a2)
|
|
$ST $c_1,0($a0) # r[0]=c1;
|
|
mflo ($t_1,$a_1,$b_0)
|
|
mfhi ($t_2,$a_1,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
$ST $c_2,$BNSZ($a0) # r[1]=c2;
|
|
|
|
mflo ($t_1,$a_2,$b_0)
|
|
mfhi ($t_2,$a_2,$b_0)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
mflo ($t_1,$a_1,$b_1)
|
|
mfhi ($t_2,$a_1,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_0,$b_2)
|
|
mfhi ($t_2,$a_0,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0) # r[2]=c3;
|
|
|
|
mflo ($t_1,$a_0,$b_3)
|
|
mfhi ($t_2,$a_0,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_1,$b_2)
|
|
mfhi ($t_2,$a_1,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_2,$b_1)
|
|
mfhi ($t_2,$a_2,$b_1)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_3,$b_0)
|
|
mfhi ($t_2,$a_3,$b_0)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,3*$BNSZ($a0) # r[3]=c1;
|
|
|
|
mflo ($t_1,$a_4,$b_0)
|
|
mfhi ($t_2,$a_4,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_3,$b_1)
|
|
mfhi ($t_2,$a_3,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_2,$b_2)
|
|
mfhi ($t_2,$a_2,$b_2)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_1,$b_3)
|
|
mfhi ($t_2,$a_1,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_0,$b_4)
|
|
mfhi ($t_2,$a_0,$b_4)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0) # r[4]=c2;
|
|
|
|
mflo ($t_1,$a_0,$b_5)
|
|
mfhi ($t_2,$a_0,$b_5)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_1,$b_4)
|
|
mfhi ($t_2,$a_1,$b_4)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_2,$b_3)
|
|
mfhi ($t_2,$a_2,$b_3)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_3,$b_2)
|
|
mfhi ($t_2,$a_3,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_4,$b_1)
|
|
mfhi ($t_2,$a_4,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_5,$b_0)
|
|
mfhi ($t_2,$a_5,$b_0)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,5*$BNSZ($a0) # r[5]=c3;
|
|
|
|
mflo ($t_1,$a_6,$b_0)
|
|
mfhi ($t_2,$a_6,$b_0)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_5,$b_1)
|
|
mfhi ($t_2,$a_5,$b_1)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_4,$b_2)
|
|
mfhi ($t_2,$a_4,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_3,$b_3)
|
|
mfhi ($t_2,$a_3,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_2,$b_4)
|
|
mfhi ($t_2,$a_2,$b_4)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_1,$b_5)
|
|
mfhi ($t_2,$a_1,$b_5)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_0,$b_6)
|
|
mfhi ($t_2,$a_0,$b_6)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,6*$BNSZ($a0) # r[6]=c1;
|
|
|
|
mflo ($t_1,$a_0,$b_7)
|
|
mfhi ($t_2,$a_0,$b_7)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_1,$b_6)
|
|
mfhi ($t_2,$a_1,$b_6)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_2,$b_5)
|
|
mfhi ($t_2,$a_2,$b_5)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_3,$b_4)
|
|
mfhi ($t_2,$a_3,$b_4)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_4,$b_3)
|
|
mfhi ($t_2,$a_4,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_5,$b_2)
|
|
mfhi ($t_2,$a_5,$b_2)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_6,$b_1)
|
|
mfhi ($t_2,$a_6,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_7,$b_0)
|
|
mfhi ($t_2,$a_7,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,7*$BNSZ($a0) # r[7]=c2;
|
|
|
|
mflo ($t_1,$a_7,$b_1)
|
|
mfhi ($t_2,$a_7,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_6,$b_2)
|
|
mfhi ($t_2,$a_6,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_5,$b_3)
|
|
mfhi ($t_2,$a_5,$b_3)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_4,$b_4)
|
|
mfhi ($t_2,$a_4,$b_4)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_3,$b_5)
|
|
mfhi ($t_2,$a_3,$b_5)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_2,$b_6)
|
|
mfhi ($t_2,$a_2,$b_6)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_1,$b_7)
|
|
mfhi ($t_2,$a_1,$b_7)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,8*$BNSZ($a0) # r[8]=c3;
|
|
|
|
mflo ($t_1,$a_2,$b_7)
|
|
mfhi ($t_2,$a_2,$b_7)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_3,$b_6)
|
|
mfhi ($t_2,$a_3,$b_6)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_4,$b_5)
|
|
mfhi ($t_2,$a_4,$b_5)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_5,$b_4)
|
|
mfhi ($t_2,$a_5,$b_4)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_6,$b_3)
|
|
mfhi ($t_2,$a_6,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_7,$b_2)
|
|
mfhi ($t_2,$a_7,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,9*$BNSZ($a0) # r[9]=c1;
|
|
|
|
mflo ($t_1,$a_7,$b_3)
|
|
mfhi ($t_2,$a_7,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_6,$b_4)
|
|
mfhi ($t_2,$a_6,$b_4)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_5,$b_5)
|
|
mfhi ($t_2,$a_5,$b_5)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_4,$b_6)
|
|
mfhi ($t_2,$a_4,$b_6)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_3,$b_7)
|
|
mfhi ($t_2,$a_3,$b_7)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,10*$BNSZ($a0) # r[10]=c2;
|
|
|
|
mflo ($t_1,$a_4,$b_7)
|
|
mfhi ($t_2,$a_4,$b_7)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_5,$b_6)
|
|
mfhi ($t_2,$a_5,$b_6)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_6,$b_5)
|
|
mfhi ($t_2,$a_6,$b_5)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
mflo ($t_1,$a_7,$b_4)
|
|
mfhi ($t_2,$a_7,$b_4)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,11*$BNSZ($a0) # r[11]=c3;
|
|
|
|
mflo ($t_1,$a_7,$b_5)
|
|
mfhi ($t_2,$a_7,$b_5)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_6,$b_6)
|
|
mfhi ($t_2,$a_6,$b_6)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_5,$b_7)
|
|
mfhi ($t_2,$a_5,$b_7)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,12*$BNSZ($a0) # r[12]=c1;
|
|
|
|
mflo ($t_1,$a_6,$b_7)
|
|
mfhi ($t_2,$a_6,$b_7)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_7,$b_6)
|
|
mfhi ($t_2,$a_7,$b_6)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,13*$BNSZ($a0) # r[13]=c2;
|
|
|
|
mflo ($t_1,$a_7,$b_7)
|
|
mfhi ($t_2,$a_7,$b_7)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
$ST $c_3,14*$BNSZ($a0) # r[14]=c3;
|
|
$ST $c_1,15*$BNSZ($a0) # r[15]=c1;
|
|
|
|
.set noreorder
|
|
___
|
|
# bn_mul_comba8 epilogue, NUBI flavour: reload $s5..$s0, $t3..$t0 and $gp
# from stack slots 10..0 and return.  The $PTR_ADD that pops the
# 12*$SZREG frame is emitted right after "jr $ra"; the preceding heredoc
# ends with ".set noreorder", so it lands in the jump's delay slot.
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $s5,10*$SZREG($sp)
|
|
$REG_L $s4,9*$SZREG($sp)
|
|
$REG_L $s3,8*$SZREG($sp)
|
|
$REG_L $s2,7*$SZREG($sp)
|
|
$REG_L $s1,6*$SZREG($sp)
|
|
$REG_L $s0,5*$SZREG($sp)
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
jr $ra
|
|
$PTR_ADD $sp,12*$SZREG
|
|
___
|
|
# bn_mul_comba8 epilogue for every other flavour: only $s5..$s0 are
# reloaded (slots 5..0) and the frame popped after "jr $ra" is 6*$SZREG.
$code.=<<___ if ($flavour !~ /nubi/i);
|
|
$REG_L $s5,5*$SZREG($sp)
|
|
$REG_L $s4,4*$SZREG($sp)
|
|
$REG_L $s3,3*$SZREG($sp)
|
|
$REG_L $s2,2*$SZREG($sp)
|
|
$REG_L $s1,1*$SZREG($sp)
|
|
$REG_L $s0,0*$SZREG($sp)
|
|
jr $ra
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
# Close bn_mul_comba8 and emit the alignment, symbol and entry label of
# bn_mul_comba4.
$code.=<<___;
|
|
.end bn_mul_comba8
|
|
|
|
.align 5
|
|
.globl bn_mul_comba4
|
|
.ent bn_mul_comba4
|
|
bn_mul_comba4:
|
|
___
|
|
# NUBI prologue: allocate a 6*$SZREG frame and save $ra, $t3..$t0 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
# Body of bn_mul_comba4: load a[0..3] from ($a1) and b[0..3] from ($a2),
# then accumulate the partial products column by column; the trailing
# mul_add_c comments name each product and show how c1/c2/c3 rotate
# through the low/middle/high accumulator roles.  Every $MULTU is issued
# while the carries of the previous product are still being folded in.
# The eight result words are stored at 0..7*$BNSZ($a0).
$code.=<<___;
|
|
.set reorder
|
|
$LD $a_0,0($a1)
|
|
$LD $b_0,0($a2)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
$LD $b_1,$BNSZ($a2)
|
|
$LD $b_2,2*$BNSZ($a2)
|
|
$LD $b_3,3*$BNSZ($a2)
|
|
mflo ($c_1,$a_0,$b_0)
|
|
mfhi ($c_2,$a_0,$b_0)
|
|
$ST $c_1,0($a0)
|
|
|
|
$MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1);
|
|
mflo ($t_1,$a_0,$b_1)
|
|
mfhi ($t_2,$a_0,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1);
|
|
$ADDU $c_3,$t_2,$at
|
|
mflo ($t_1,$a_1,$b_0)
|
|
mfhi ($t_2,$a_1,$b_0)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
$ST $c_2,$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_2,$b_0)
|
|
mfhi ($t_2,$a_2,$b_0)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
mflo ($t_1,$a_1,$b_1)
|
|
mfhi ($t_2,$a_1,$b_1)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_0,$b_2)
|
|
mfhi ($t_2,$a_0,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_0,$b_3)
|
|
mfhi ($t_2,$a_0,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $c_3,$c_2,$t_2
|
|
mflo ($t_1,$a_1,$b_2)
|
|
mfhi ($t_2,$a_1,$b_2)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_2,$b_1)
|
|
mfhi ($t_2,$a_2,$b_1)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
mflo ($t_1,$a_3,$b_0)
|
|
mfhi ($t_2,$a_3,$b_0)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,3*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_3,$b_1)
|
|
mfhi ($t_2,$a_3,$b_1)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $c_1,$c_3,$t_2
|
|
mflo ($t_1,$a_2,$b_2)
|
|
mfhi ($t_2,$a_2,$b_2)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
mflo ($t_1,$a_1,$b_3)
|
|
mfhi ($t_2,$a_1,$b_3)
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_2,$b_3)
|
|
mfhi ($t_2,$a_2,$b_3)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $c_2,$c_1,$t_2
|
|
mflo ($t_1,$a_3,$b_2)
|
|
mfhi ($t_2,$a_3,$b_2)
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,5*$BNSZ($a0)
|
|
|
|
mflo ($t_1,$a_3,$b_3)
|
|
mfhi ($t_2,$a_3,$b_3)
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
$ST $c_1,6*$BNSZ($a0)
|
|
$ST $c_2,7*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
___
|
|
# NUBI epilogue: reload $t3..$t0 and $gp and pop the 6*$SZREG frame.
# The saved $ra slot is not read back.
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
# Return sequence shared by all flavours; the explicit "nop" follows the
# jump because ".set noreorder" is in effect.
$code.=<<___;
|
|
jr $ra
|
|
nop
|
|
.end bn_mul_comba4
|
|
___
|
|
|
|
# From here on $a_4..$a_7 name the registers that carried b[0..3] in the
# multiplication routines; the squaring routines below have no b[] operand
# and load a[4..7] into them instead.
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
|
|
|
|
# add_c2 - append to $code the assembly for one doubled accumulation step
# (the "mul_add_c2" step of the comba squaring routines).
#
# Every argument except $warm is a register name that is interpolated
# verbatim into the generated text:
#
#   $hi,$lo      high/low word of the product that is pending on entry;
#                both are clobbered as carry scratch and finally receive
#                the result of the forward multiplication
#   $c0,$c1,$c2  three-word accumulator, least significant word first
#   $warm        false on the first call for a given accumulator rotation:
#                $c2 then has no carry to keep and is overwritten by the
#                first carry out of $c1; true on later calls, where that
#                carry is added to $c2 instead
#   $an,$bn      operands of the multiplication whose result is consumed
#                by the *next* step ("forward multiplication")
#
# The emitted sequence adds ($hi:$lo) to ($c2:$c1:$c0) twice: $lo is added
# to $c0 two times, the two carries are folded into separate copies of the
# high word ($at and $hi), and both copies are then added to $c1 with
# their carries going to $c2.
#
# Declared without a prototype: the sub takes eight arguments, so the
# former empty "()" prototype was wrong and only went unnoticed because
# the calls in this file use the &-form, which ignores prototypes.
sub add_c2 {
    my ($hi,$lo,$c0,$c1,$c2,
        $warm,      # !$warm denotes first call with specific sequence of
                    # $c_[XYZ] when there is no Z-carry to accumulate yet;
        $an,$bn     # these two are arguments for multiplication which
                    # result is used in *next* step [which is why it's
                    # commented as "forward multiplication" below];
        )=@_;

    # First and second addition of the low word; $at and $lo pick up the
    # respective carries and are merged into the two high-word copies.
    $code.=<<___;
	$ADDU $c0,$lo
	sltu $at,$c0,$lo
	$MULTU ($an,$bn) # forward multiplication
	$ADDU $c0,$lo
	$ADDU $at,$hi
	sltu $lo,$c0,$lo
	$ADDU $c1,$at
	$ADDU $hi,$lo
___

    # Carry out of "$c1 += $at": initialise $c2 with it on a cold call ...
    $code.=<<___ if (!$warm);
	sltu $c2,$c1,$at
	$ADDU $c1,$hi
___

    # ... or accumulate it into the existing $c2 on a warm call.
    $code.=<<___ if ($warm);
	sltu $at,$c1,$at
	$ADDU $c1,$hi
	$ADDU $c2,$at
___

    # Carry out of "$c1 += $hi", then fetch the forward product.
    $code.=<<___;
	sltu $hi,$c1,$hi
	$ADDU $c2,$hi
	mflo ($lo,$an,$bn)
	mfhi ($hi,$an,$bn)
___
}
|
|
|
|
# Alignment, symbol and entry label of bn_sqr_comba8.
$code.=<<___;
|
|
|
|
.align 5
|
|
.globl bn_sqr_comba8
|
|
.ent bn_sqr_comba8
|
|
bn_sqr_comba8:
|
|
___
|
|
# NUBI prologue: allocate a 6*$SZREG frame and save $ra, $t3..$t0 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
# Load a[0..7] from ($a1) and produce r[0] and r[1].  a[0]*a[0] is added
# once.  The a[0]*a[1] cross product is doubled in place: each "slt
# ...,$zero" captures the top bit of a half before "$SLL ...,1" shifts it
# out ($c_1 receives the bit leaving the high half, $a2 is used as scratch
# for the bit moving from the low half into the high half).
$code.=<<___;
|
|
.set reorder
|
|
$LD $a_0,0($a1)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
|
|
$MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_4,4*$BNSZ($a1)
|
|
$LD $a_5,5*$BNSZ($a1)
|
|
$LD $a_6,6*$BNSZ($a1)
|
|
$LD $a_7,7*$BNSZ($a1)
|
|
mflo ($c_1,$a_0,$a_0)
|
|
mfhi ($c_2,$a_0,$a_0)
|
|
$ST $c_1,0($a0)
|
|
|
|
$MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
|
|
mflo ($t_1,$a_0,$a_1)
|
|
mfhi ($t_2,$a_0,$a_1)
|
|
slt $c_1,$t_2,$zero
|
|
$SLL $t_2,1
|
|
$MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
|
|
slt $a2,$t_1,$zero
|
|
$ADDU $t_2,$a2
|
|
$SLL $t_1,1
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$ADDU $c_3,$t_2,$at
|
|
$ST $c_2,$BNSZ($a0)
|
|
mflo ($t_1,$a_2,$a_0)
|
|
mfhi ($t_2,$a_2,$a_0)
|
|
___
|
|
# Each &add_c2 call below emits code that adds the product pending in
# ($t_2:$t_1) twice to the accumulator given by arguments 3..5 and issues
# the multiplication named in its trailing comment.  The sixth argument
# is 0 on the first doubled term of a column and 1 afterwards.
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
# The pending product is now a square term: add it once, then store r[2].
$code.=<<___;
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_3)
|
|
mfhi ($t_2,$a_0,$a_3)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ST $c_1,3*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
# Square term a[2]*a[2] added once; store r[4].
$code.=<<___;
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_5)
|
|
mfhi ($t_2,$a_0,$a_5)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_6,$a_0); # mul_add_c2(a[6],b[0],c1,c2,c3);
|
|
$code.=<<___;
|
|
$ST $c_3,5*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
# Square term a[3]*a[3] added once; store r[6].
$code.=<<___;
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,6*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_7)
|
|
mfhi ($t_2,$a_0,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2);
|
|
$code.=<<___;
|
|
$ST $c_2,7*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2);
|
|
# Square term a[4]*a[4] added once; store r[8].
$code.=<<___;
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,8*$BNSZ($a0)
|
|
mflo ($t_1,$a_2,$a_7)
|
|
mfhi ($t_2,$a_2,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ST $c_1,9*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1);
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
|
|
$a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1);
|
|
# Square term a[5]*a[5] added once; store r[10].
$code.=<<___;
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,10*$BNSZ($a0)
|
|
mflo ($t_1,$a_4,$a_7)
|
|
mfhi ($t_2,$a_4,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2);
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
|
|
$a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3);
|
|
$code.=<<___;
|
|
$ST $c_3,11*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3);
|
|
# Square term a[6]*a[6] added once; store r[12].
$code.=<<___;
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
sltu $at,$c_2,$t_2
|
|
$ADDU $c_3,$at
|
|
$ST $c_1,12*$BNSZ($a0)
|
|
mflo ($t_1,$a_6,$a_7)
|
|
mfhi ($t_2,$a_6,$a_7)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2);
|
|
# Store r[13], add the final square a[7]*a[7] once and store r[14], r[15].
$code.=<<___;
|
|
$ST $c_2,13*$BNSZ($a0)
|
|
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
$ST $c_3,14*$BNSZ($a0)
|
|
$ST $c_1,15*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
___
|
|
# NUBI epilogue: reload $t3..$t0 and $gp and pop the 6*$SZREG frame.
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
# Return from and close bn_sqr_comba8, then emit the alignment, symbol and
# entry label of bn_sqr_comba4.
$code.=<<___;
|
|
jr $ra
|
|
nop
|
|
.end bn_sqr_comba8
|
|
|
|
.align 5
|
|
.globl bn_sqr_comba4
|
|
.ent bn_sqr_comba4
|
|
bn_sqr_comba4:
|
|
___
|
|
# NUBI prologue: allocate a 6*$SZREG frame and save $ra, $t3..$t0 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
.frame $sp,6*$SZREG,$ra
|
|
.mask 0x8000f008,-$SZREG
|
|
.set noreorder
|
|
$PTR_SUB $sp,6*$SZREG
|
|
$REG_S $ra,5*$SZREG($sp)
|
|
$REG_S $t3,4*$SZREG($sp)
|
|
$REG_S $t2,3*$SZREG($sp)
|
|
$REG_S $t1,2*$SZREG($sp)
|
|
$REG_S $t0,1*$SZREG($sp)
|
|
$REG_S $gp,0*$SZREG($sp)
|
|
___
|
|
# Load a[0..3] from ($a1) and produce r[0] and r[1].  The a[0]*a[1] cross
# product is doubled in place: "slt ...,$zero" captures the top bit of
# each half before "$SLL ...,1" shifts it out ($a2 is used as scratch).
$code.=<<___;
|
|
.set reorder
|
|
$LD $a_0,0($a1)
|
|
$LD $a_1,$BNSZ($a1)
|
|
$MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3);
|
|
$LD $a_2,2*$BNSZ($a1)
|
|
$LD $a_3,3*$BNSZ($a1)
|
|
mflo ($c_1,$a_0,$a_0)
|
|
mfhi ($c_2,$a_0,$a_0)
|
|
$ST $c_1,0($a0)
|
|
|
|
$MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1);
|
|
mflo ($t_1,$a_0,$a_1)
|
|
mfhi ($t_2,$a_0,$a_1)
|
|
slt $c_1,$t_2,$zero
|
|
$SLL $t_2,1
|
|
$MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2);
|
|
slt $a2,$t_1,$zero
|
|
$ADDU $t_2,$a2
|
|
$SLL $t_1,1
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$ADDU $c_3,$t_2,$at
|
|
$ST $c_2,$BNSZ($a0)
|
|
mflo ($t_1,$a_2,$a_0)
|
|
mfhi ($t_2,$a_2,$a_0)
|
|
___
|
|
# Each &add_c2 call emits code that adds the product pending in
# ($t_2:$t_1) twice and issues the multiplication named in its trailing
# comment; the sixth argument is 0 on the first doubled term of a column.
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2);
|
|
# Square term a[1]*a[1] added once; store r[2].
$code.=<<___;
|
|
$ADDU $c_3,$t_1
|
|
sltu $at,$c_3,$t_1
|
|
$MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_1,$t_2
|
|
sltu $at,$c_1,$t_2
|
|
$ADDU $c_2,$at
|
|
$ST $c_3,2*$BNSZ($a0)
|
|
mflo ($t_1,$a_0,$a_3)
|
|
mfhi ($t_2,$a_0,$a_3)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
|
|
$a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3);
|
|
&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
|
|
$a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1);
|
|
$code.=<<___;
|
|
$ST $c_1,3*$BNSZ($a0)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
|
|
$a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1);
|
|
# Square term a[2]*a[2] added once; store r[4].
$code.=<<___;
|
|
$ADDU $c_2,$t_1
|
|
sltu $at,$c_2,$t_1
|
|
$MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2);
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_3,$t_2
|
|
sltu $at,$c_3,$t_2
|
|
$ADDU $c_1,$at
|
|
$ST $c_2,4*$BNSZ($a0)
|
|
mflo ($t_1,$a_2,$a_3)
|
|
mfhi ($t_2,$a_2,$a_3)
|
|
___
|
|
&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
|
|
$a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3);
|
|
# Store r[5], add the final square a[3]*a[3] once and store r[6], r[7].
$code.=<<___;
|
|
$ST $c_3,5*$BNSZ($a0)
|
|
|
|
$ADDU $c_1,$t_1
|
|
sltu $at,$c_1,$t_1
|
|
$ADDU $t_2,$at
|
|
$ADDU $c_2,$t_2
|
|
$ST $c_1,6*$BNSZ($a0)
|
|
$ST $c_2,7*$BNSZ($a0)
|
|
|
|
.set noreorder
|
|
___
|
|
# NUBI epilogue: reload $t3..$t0 and $gp and pop the 6*$SZREG frame.
$code.=<<___ if ($flavour =~ /nubi/i);
|
|
$REG_L $t3,4*$SZREG($sp)
|
|
$REG_L $t2,3*$SZREG($sp)
|
|
$REG_L $t1,2*$SZREG($sp)
|
|
$REG_L $t0,1*$SZREG($sp)
|
|
$REG_L $gp,0*$SZREG($sp)
|
|
$PTR_ADD $sp,6*$SZREG
|
|
___
|
|
# Return sequence shared by all flavours.
$code.=<<___;
|
|
jr $ra
|
|
nop
|
|
.end bn_sqr_comba4
|
|
___
|
|
# Write the generated assembly to STDOUT.  Both the write and the close are
# checked: output is buffered, so a failure (full disk, closed pipe) may
# only surface at close time, and an unchecked failure would leave a
# truncated assembly file behind while the script still exits with 0.
print $code or die "error writing to STDOUT: $!";
close STDOUT or die "error closing STDOUT: $!";
|