Sparc v8plus assembler.

Submitted by: Andy Polyakov <appro@fy.chalmers.se>
This commit is contained in:
Ulf Möller 1999-05-04 20:35:18 +00:00
parent d872c55c20
commit 4f5fac8011
5 changed files with 1692 additions and 127 deletions

View File

@ -5,6 +5,9 @@
Changes between 0.9.2b and 0.9.3
*) Sparc v8plus assembler for the bignum library.
[Andy Polyakov <appro@fy.chalmers.se>]
*) Accept any -xxx and +xxx compiler options in Configure.
[Ulf Möller]

View File

@ -115,8 +115,8 @@ my %table=(
# Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
# SC5.0 with the compiler common patch works.
"solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
# Sunos configs, assuming sparc for the gcc one.
##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",

View File

@ -101,6 +101,9 @@ asm/co86unix.cpp: asm/co-586.pl
# Sparc v8 assembler
asm/sparcv8.o: asm/sparcv8.S
	$(CC) -c -o asm/sparcv8.o asm/sparcv8.S
# Sparc v8plus assembler: target must be the object file asm/sparcv8plus.o
# (the name the solaris-usparc configs reference); without the .o suffix
# this rule never fires and the build falls back to the generic rule,
# dropping the required -xarch=v8plus flag.
asm/sparcv8plus.o: asm/sparcv8plus.S
	$(CC) -c -xarch=v8plus -o asm/sparcv8plus.o asm/sparcv8plus.S
# MIPS 64 bit assembler
asm/mips3.o: asm/mips3.s
/usr/bin/as -mips3 -O2 -o asm/mips3.o asm/mips3.s

View File

@ -1,4 +1,4 @@
.ident "sparcv8.s, Version 1.1"
.ident "sparcv8.s, Version 1.2"
.ident "SPARC v8 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
/*
@ -24,14 +24,14 @@
/*
* Revision history.
*
* 1.1 - new loop unrolling model(*)
* - 10% performance boost(*)
* 1.1 - new loop unrolling model(*);
* 1.2 - made gas friendly;
*
* (*) see bn_asm.sparc.v8plus.S for details
*/
.section ".text",#alloc,#execinstr
.file "sparcv8.s"
.file "bn_asm.sparc.v8.S"
.align 32
@ -546,48 +546,38 @@ bn_sub_words:
.type bn_sub_words,#function
.size bn_sub_words,(.-bn_sub_words)
#define FRAME_SIZE -96
#define FRAME_SIZE -96
/*
* Here is register usage map for *all* routines below.
*/
#define a_0 %l0
#define a_0_ [%i1]
#define a_1 %l1
#define a_1_ [%i1+4]
#define a_2 %l2
#define a_2_ [%i1+8]
#define a_3 %l3
#define a_3_ [%i1+12]
#define a_4 %l4
#define a_4_ [%i1+16]
#define a_5 %l5
#define a_5_ [%i1+20]
#define a_6 %l6
#define a_6_ [%i1+24]
#define a_7 %l7
#define a_7_ [%i1+28]
#define b_0 %g1
#define b_0_ [%i2]
#define b_1 %g2
#define b_1_ [%i2+4]
#define b_2 %g3
#define b_2_ [%i2+8]
#define b_3 %g4
#define b_3_ [%i2+12]
#define b_4 %i3
#define b_4_ [%i2+16]
#define b_5 %i4
#define b_5_ [%i2+20]
#define b_6 %i5
#define b_6_ [%i2+24]
#define b_7 %o5
#define b_7_ [%i2+28]
#define t_1 %o0
#define t_2 %o1
#define c_1 %o2
#define c_2 %o3
#define c_3 %o4
#define t_1 %o0
#define t_2 %o1
#define a(I) [%i1+4*I]
#define b(I) [%i2+4*I]
#define r(I) [%i0+4*I]
#define a_0 %l0
#define a_1 %l1
#define a_2 %l2
#define a_3 %l3
#define a_4 %l4
#define a_5 %l5
#define a_6 %l6
#define a_7 %l7
#define b_0 %i3
#define b_1 %i4
#define b_2 %i5
#define b_3 %o5
#define b_4 %g1
#define b_5 %g2
#define b_6 %g3
#define b_7 %g4
.align 32
.global bn_mul_comba8
@ -597,25 +587,25 @@ bn_sub_words:
*/
bn_mul_comba8:
save %sp,FRAME_SIZE,%sp
ld a_0_,a_0
ld b_0_,b_0
ld a(0),a_0
ld b(0),b_0
umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
ld b_1_,b_1
ld b(1),b_1
rd %y,c_2
st c_1,[%i0] !r[0]=c1;
st c_1,r(0) !r[0]=c1;
umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
ld a_1_,a_1
ld a(1),a_1
addcc c_2,t_1,c_2
rd %y,t_2
addxcc %g0,t_2,c_3 !=
addx %g0,%g0,c_1
ld a_2_,a_2
ld a(2),a_2
umul a_1,b_0,t_1 !mul_add_c(a[1],b[0],c2,c3,c1);
addcc c_2,t_1,c_2 !=
rd %y,t_2
addxcc c_3,t_2,c_3
st c_2,[%i0+4] !r[1]=c2;
st c_2,r(1) !r[1]=c2;
addx c_1,%g0,c_1 !=
umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
@ -623,19 +613,19 @@ bn_mul_comba8:
rd %y,t_2
addxcc c_1,t_2,c_1 !=
addx %g0,%g0,c_2
ld b_2_,b_2
ld b(2),b_2
umul a_1,b_1,t_1 !mul_add_c(a[1],b[1],c3,c1,c2);
addcc c_3,t_1,c_3 !=
rd %y,t_2
addxcc c_1,t_2,c_1
ld b_3_,b_3
ld b(3),b_3
addx c_2,%g0,c_2 !=
umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2
addxcc c_1,t_2,c_1 !=
addx c_2,%g0,c_2
st c_3,[%i0+8] !r[2]=c3;
st c_3,r(2) !r[2]=c3;
umul a_0,b_3,t_1 !mul_add_c(a[0],b[3],c1,c2,c3);
addcc c_1,t_1,c_1 !=
@ -647,19 +637,19 @@ bn_mul_comba8:
rd %y,t_2
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
ld a_3_,a_3
ld a(3),a_3
umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
addcc c_1,t_1,c_1
rd %y,t_2 !=
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3
ld a_4_,a_4
ld a(4),a_4
umul a_3,b_0,t_1 !mul_add_c(a[3],b[0],c1,c2,c3);!=
addcc c_1,t_1,c_1
rd %y,t_2
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
st c_1,[%i0+12] !r[3]=c1;
st c_1,r(3) !r[3]=c1;
umul a_4,b_0,t_1 !mul_add_c(a[4],b[0],c2,c3,c1);
addcc c_2,t_1,c_2
@ -676,19 +666,19 @@ bn_mul_comba8:
rd %y,t_2
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1 !=
ld b_4_,b_4
ld b(4),b_4
umul a_1,b_3,t_1 !mul_add_c(a[1],b[3],c2,c3,c1);
addcc c_2,t_1,c_2
rd %y,t_2 !=
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1
ld b_5_,b_5
ld b(5),b_5
umul a_0,b_4,t_1 !=!mul_add_c(a[0],b[4],c2,c3,c1);
addcc c_2,t_1,c_2
rd %y,t_2
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1 !=
st c_2,[%i0+16] !r[4]=c2;
st c_2,r(4) !r[4]=c2;
umul a_0,b_5,t_1 !mul_add_c(a[0],b[5],c3,c1,c2);
addcc c_3,t_1,c_3
@ -710,19 +700,19 @@ bn_mul_comba8:
rd %y,t_2
addxcc c_1,t_2,c_1 !=
addx c_2,%g0,c_2
ld a_5_,a_5
ld a(5),a_5
umul a_4,b_1,t_1 !mul_add_c(a[4],b[1],c3,c1,c2);
addcc c_3,t_1,c_3 !=
rd %y,t_2
addxcc c_1,t_2,c_1
ld a_6_,a_6
ld a(6),a_6
addx c_2,%g0,c_2 !=
umul a_5,b_0,t_1 !mul_add_c(a[5],b[0],c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2
addxcc c_1,t_2,c_1 !=
addx c_2,%g0,c_2
st c_3,[%i0+20] !r[5]=c3;
st c_3,r(5) !r[5]=c3;
umul a_6,b_0,t_1 !mul_add_c(a[6],b[0],c1,c2,c3);
addcc c_1,t_1,c_1 !=
@ -748,19 +738,19 @@ bn_mul_comba8:
addcc c_1,t_1,c_1 !=
rd %y,t_2
addxcc c_2,t_2,c_2
ld b_6_,b_6
ld b(6),b_6
addx c_3,%g0,c_3 !=
umul a_1,b_5,t_1 !mul_add_c(a[1],b[5],c1,c2,c3);
addcc c_1,t_1,c_1
rd %y,t_2
addxcc c_2,t_2,c_2 !=
addx c_3,%g0,c_3
ld b_7_,b_7
ld b(7),b_7
umul a_0,b_6,t_1 !mul_add_c(a[0],b[6],c1,c2,c3);
addcc c_1,t_1,c_1 !=
rd %y,t_2
addxcc c_2,t_2,c_2
st c_1,[%i0+24] !r[6]=c1;
st c_1,r(6) !r[6]=c1;
addx c_3,%g0,c_3 !=
umul a_0,b_7,t_1 !mul_add_c(a[0],b[7],c2,c3,c1);
@ -793,7 +783,7 @@ bn_mul_comba8:
rd %y,t_2 !=
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1
ld a_7_,a_7
ld a(7),a_7
umul a_6,b_1,t_1 !=!mul_add_c(a[6],b[1],c2,c3,c1);
addcc c_2,t_1,c_2
rd %y,t_2
@ -804,7 +794,7 @@ bn_mul_comba8:
rd %y,t_2
addxcc c_3,t_2,c_3 !=
addx c_1,%g0,c_1
st c_2,[%i0+28] !r[7]=c2;
st c_2,r(7) !r[7]=c2;
umul a_7,b_1,t_1 !mul_add_c(a[7],b[1],c3,c1,c2);
addcc c_3,t_1,c_3 !=
@ -841,7 +831,7 @@ bn_mul_comba8:
rd %y,t_2
addxcc c_1,t_2,c_1 !
addx c_2,%g0,c_2
st c_3,[%i0+32] !r[8]=c3;
st c_3,r(8) !r[8]=c3;
umul a_2,b_7,t_1 !mul_add_c(a[2],b[7],c1,c2,c3);
addcc c_1,t_1,c_1 !=
@ -873,7 +863,7 @@ bn_mul_comba8:
rd %y,t_2
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
st c_1,[%i0+36] !r[9]=c1;
st c_1,r(9) !r[9]=c1;
umul a_7,b_3,t_1 !mul_add_c(a[7],b[3],c2,c3,c1);
addcc c_2,t_1,c_2
@ -900,7 +890,7 @@ bn_mul_comba8:
rd %y,t_2 !=
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1
st c_2,[%i0+40] !r[10]=c2;
st c_2,r(10) !r[10]=c2;
umul a_4,b_7,t_1 !=!mul_add_c(a[4],b[7],c3,c1,c2);
addcc c_3,t_1,c_3
@ -921,7 +911,7 @@ bn_mul_comba8:
addcc c_3,t_1,c_3 !=
rd %y,t_2
addxcc c_1,t_2,c_1
st c_3,[%i0+44] !r[11]=c3;
st c_3,r(11) !r[11]=c3;
addx c_2,%g0,c_2 !=
umul a_7,b_5,t_1 !mul_add_c(a[7],b[5],c1,c2,c3);
@ -938,7 +928,7 @@ bn_mul_comba8:
addcc c_1,t_1,c_1 !=
rd %y,t_2
addxcc c_2,t_2,c_2
st c_1,[%i0+48] !r[12]=c1;
st c_1,r(12) !r[12]=c1;
addx c_3,%g0,c_3 !=
umul a_6,b_7,t_1 !mul_add_c(a[6],b[7],c2,c3,c1);
@ -951,15 +941,15 @@ bn_mul_comba8:
rd %y,t_2 !=
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1
st c_2,[%i0+52] !r[13]=c2;
st c_2,r(13) !r[13]=c2;
umul a_7,b_7,t_1 !=!mul_add_c(a[7],b[7],c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2
addxcc c_1,t_2,c_1
nop !=
st c_3,[%i0+56] !r[14]=c3;
st c_1,[%i0+60] !r[15]=c1;
st c_3,r(14) !r[14]=c3;
st c_1,r(15) !r[15]=c1;
ret
restore %g0,%g0,%o0
@ -976,45 +966,45 @@ bn_mul_comba8:
*/
bn_mul_comba4:
save %sp,FRAME_SIZE,%sp
ld a_0_,a_0
ld b_0_,b_0
ld a(0),a_0
ld b(0),b_0
umul a_0,b_0,c_1 !=!mul_add_c(a[0],b[0],c1,c2,c3);
ld b_1_,b_1
ld b(1),b_1
rd %y,c_2
st c_1,[%i0] !r[0]=c1;
st c_1,r(0) !r[0]=c1;
umul a_0,b_1,t_1 !=!mul_add_c(a[0],b[1],c2,c3,c1);
ld a_1_,a_1
ld a(1),a_1
addcc c_2,t_1,c_2
rd %y,t_2 !=
addxcc %g0,t_2,c_3
addx %g0,%g0,c_1
ld a_2_,a_2
ld a(2),a_2
umul a_1,b_0,t_1 !=!mul_add_c(a[1],b[0],c2,c3,c1);
addcc c_2,t_1,c_2
rd %y,t_2
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1 !=
st c_2,[%i0+4] !r[1]=c2;
st c_2,r(1) !r[1]=c2;
umul a_2,b_0,t_1 !mul_add_c(a[2],b[0],c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2 !=
addxcc c_1,t_2,c_1
addx %g0,%g0,c_2
ld b_2_,b_2
ld b(2),b_2
umul a_1,b_1,t_1 !=!mul_add_c(a[1],b[1],c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2
addxcc c_1,t_2,c_1
addx c_2,%g0,c_2 !=
ld b_3_,b_3
ld b(3),b_3
umul a_0,b_2,t_1 !mul_add_c(a[0],b[2],c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2 !=
addxcc c_1,t_2,c_1
addx c_2,%g0,c_2
st c_3,[%i0+8] !r[2]=c3;
st c_3,r(2) !r[2]=c3;
umul a_0,b_3,t_1 !=!mul_add_c(a[0],b[3],c1,c2,c3);
addcc c_1,t_1,c_1
@ -1026,7 +1016,7 @@ bn_mul_comba4:
rd %y,t_2
addxcc c_2,t_2,c_2 !=
addx c_3,%g0,c_3
ld a_3_,a_3
ld a(3),a_3
umul a_2,b_1,t_1 !mul_add_c(a[2],b[1],c1,c2,c3);
addcc c_1,t_1,c_1 !=
rd %y,t_2
@ -1037,7 +1027,7 @@ bn_mul_comba4:
rd %y,t_2
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
st c_1,[%i0+12] !r[3]=c1;
st c_1,r(3) !r[3]=c1;
umul a_3,b_1,t_1 !mul_add_c(a[3],b[1],c2,c3,c1);
addcc c_2,t_1,c_2
@ -1054,7 +1044,7 @@ bn_mul_comba4:
rd %y,t_2
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1 !=
st c_2,[%i0+16] !r[4]=c2;
st c_2,r(4) !r[4]=c2;
umul a_2,b_3,t_1 !mul_add_c(a[2],b[3],c3,c1,c2);
addcc c_3,t_1,c_3
@ -1065,15 +1055,15 @@ bn_mul_comba4:
addcc c_3,t_1,c_3 !=
rd %y,t_2
addxcc c_1,t_2,c_1
st c_3,[%i0+20] !r[5]=c3;
st c_3,r(5) !r[5]=c3;
addx c_2,%g0,c_2 !=
umul a_3,b_3,t_1 !mul_add_c(a[3],b[3],c1,c2,c3);
addcc c_1,t_1,c_1
rd %y,t_2
addxcc c_2,t_2,c_2 !=
st c_1,[%i0+24] !r[6]=c1;
st c_2,[%i0+28] !r[7]=c2;
st c_1,r(6) !r[6]=c1;
st c_2,r(7) !r[7]=c2;
ret
restore %g0,%g0,%o0
@ -1086,13 +1076,13 @@ bn_mul_comba4:
.global bn_sqr_comba8
bn_sqr_comba8:
save %sp,FRAME_SIZE,%sp
ld a_0_,a_0
ld a_1_,a_1
ld a(0),a_0
ld a(1),a_1
umul a_0,a_0,c_1 !=!sqr_add_c(a,0,c1,c2,c3);
rd %y,c_2
st c_1,[%i0] !r[0]=c1;
st c_1,r(0) !r[0]=c1;
ld a_2_,a_2
ld a(2),a_2
umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
addcc c_2,t_1,c_2
rd %y,t_2
@ -1100,7 +1090,7 @@ bn_sqr_comba8:
addx %g0,%g0,c_1 !=
addcc c_2,t_1,c_2
addxcc c_3,t_2,c_3
st c_2,[%i0+4] !r[1]=c2;
st c_2,r(1) !r[1]=c2;
addx c_1,%g0,c_1 !=
umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
@ -1111,13 +1101,13 @@ bn_sqr_comba8:
addcc c_3,t_1,c_3
addxcc c_1,t_2,c_1
addx c_2,%g0,c_2 !=
ld a_3_,a_3
ld a(3),a_3
umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2 !=
addxcc c_1,t_2,c_1
addx c_2,%g0,c_2
st c_3,[%i0+8] !r[2]=c3;
st c_3,r(2) !r[2]=c3;
umul a_0,a_3,t_1 !=!sqr_add_c2(a,3,0,c1,c2,c3);
addcc c_1,t_1,c_1
@ -1126,7 +1116,7 @@ bn_sqr_comba8:
addx %g0,%g0,c_3 !=
addcc c_1,t_1,c_1
addxcc c_2,t_2,c_2
ld a_4_,a_4
ld a(4),a_4
addx c_3,%g0,c_3 !=
umul a_1,a_2,t_1 !sqr_add_c2(a,2,1,c1,c2,c3);
addcc c_1,t_1,c_1
@ -1136,7 +1126,7 @@ bn_sqr_comba8:
addcc c_1,t_1,c_1
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
st c_1,[%i0+12] !r[3]=c1;
st c_1,r(3) !r[3]=c1;
umul a_4,a_0,t_1 !sqr_add_c2(a,4,0,c2,c3,c1);
addcc c_2,t_1,c_2
@ -1154,12 +1144,12 @@ bn_sqr_comba8:
addcc c_2,t_1,c_2
addxcc c_3,t_2,c_3 !=
addx c_1,%g0,c_1
ld a_5_,a_5
ld a(5),a_5
umul a_2,a_2,t_1 !sqr_add_c(a,2,c2,c3,c1);
addcc c_2,t_1,c_2 !=
rd %y,t_2
addxcc c_3,t_2,c_3
st c_2,[%i0+16] !r[4]=c2;
st c_2,r(4) !r[4]=c2;
addx c_1,%g0,c_1 !=
umul a_0,a_5,t_1 !sqr_add_c2(a,5,0,c3,c1,c2);
@ -1178,7 +1168,7 @@ bn_sqr_comba8:
addcc c_3,t_1,c_3
addxcc c_1,t_2,c_1
addx c_2,%g0,c_2 !=
ld a_6_,a_6
ld a(6),a_6
umul a_2,a_3,t_1 !sqr_add_c2(a,3,2,c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2 !=
@ -1187,7 +1177,7 @@ bn_sqr_comba8:
addcc c_3,t_1,c_3
addxcc c_1,t_2,c_1 !=
addx c_2,%g0,c_2
st c_3,[%i0+20] !r[5]=c3;
st c_3,r(5) !r[5]=c3;
umul a_6,a_0,t_1 !sqr_add_c2(a,6,0,c1,c2,c3);
addcc c_1,t_1,c_1 !=
@ -1213,13 +1203,13 @@ bn_sqr_comba8:
addcc c_1,t_1,c_1 !=
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3
ld a_7_,a_7
ld a(7),a_7
umul a_3,a_3,t_1 !=!sqr_add_c(a,3,c1,c2,c3);
addcc c_1,t_1,c_1
rd %y,t_2
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
st c_1,[%i0+24] !r[6]=c1;
st c_1,r(6) !r[6]=c1;
umul a_0,a_7,t_1 !sqr_add_c2(a,7,0,c2,c3,c1);
addcc c_2,t_1,c_2
@ -1253,7 +1243,7 @@ bn_sqr_comba8:
addcc c_2,t_1,c_2
addxcc c_3,t_2,c_3 !=
addx c_1,%g0,c_1
st c_2,[%i0+28] !r[7]=c2;
st c_2,r(7) !r[7]=c2;
umul a_7,a_1,t_1 !sqr_add_c2(a,7,1,c3,c1,c2);
addcc c_3,t_1,c_3 !=
@ -1283,7 +1273,7 @@ bn_sqr_comba8:
addcc c_3,t_1,c_3 !=
rd %y,t_2
addxcc c_1,t_2,c_1
st c_3,[%i0+32] !r[8]=c3;
st c_3,r(8) !r[8]=c3;
addx c_2,%g0,c_2 !=
umul a_2,a_7,t_1 !sqr_add_c2(a,7,2,c1,c2,c3);
@ -1310,7 +1300,7 @@ bn_sqr_comba8:
addcc c_1,t_1,c_1
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
st c_1,[%i0+36] !r[9]=c1;
st c_1,r(9) !r[9]=c1;
umul a_7,a_3,t_1 !sqr_add_c2(a,7,3,c2,c3,c1);
addcc c_2,t_1,c_2
@ -1333,7 +1323,7 @@ bn_sqr_comba8:
rd %y,t_2 !=
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1
st c_2,[%i0+40] !r[10]=c2;
st c_2,r(10) !r[10]=c2;
umul a_4,a_7,t_1 !=!sqr_add_c2(a,7,4,c3,c1,c2);
addcc c_3,t_1,c_3
@ -1350,7 +1340,7 @@ bn_sqr_comba8:
addx c_2,%g0,c_2 !=
addcc c_3,t_1,c_3
addxcc c_1,t_2,c_1
st c_3,[%i0+44] !r[11]=c3;
st c_3,r(11) !r[11]=c3;
addx c_2,%g0,c_2 !=
umul a_7,a_5,t_1 !sqr_add_c2(a,7,5,c1,c2,c3);
@ -1366,7 +1356,7 @@ bn_sqr_comba8:
rd %y,t_2
addxcc c_2,t_2,c_2 !=
addx c_3,%g0,c_3
st c_1,[%i0+48] !r[12]=c1;
st c_1,r(12) !r[12]=c1;
umul a_6,a_7,t_1 !sqr_add_c2(a,7,6,c2,c3,c1);
addcc c_2,t_1,c_2 !=
@ -1376,15 +1366,15 @@ bn_sqr_comba8:
addcc c_2,t_1,c_2 !=
rd %y,t_2
addxcc c_3,t_2,c_3
st c_2,[%i0+52] !r[13]=c2;
st c_2,r(13) !r[13]=c2;
addx c_1,%g0,c_1 !=
umul a_7,a_7,t_1 !sqr_add_c(a,7,c3,c1,c2);
addcc c_3,t_1,c_3
rd %y,t_2
addxcc c_1,t_2,c_1 !=
st c_3,[%i0+56] !r[14]=c3;
st c_1,[%i0+60] !r[15]=c1;
st c_3,r(14) !r[14]=c3;
st c_1,r(15) !r[15]=c1;
ret
restore %g0,%g0,%o0
@ -1401,23 +1391,23 @@ bn_sqr_comba8:
*/
bn_sqr_comba4:
save %sp,FRAME_SIZE,%sp
ld a_0_,a_0
ld a(0),a_0
umul a_0,a_0,c_1 !sqr_add_c(a,0,c1,c2,c3);
ld a_1_,a_1 !=
ld a(1),a_1 !=
rd %y,c_2
st c_1,[%i0] !r[0]=c1;
st c_1,r(0) !r[0]=c1;
ld a_1_,a_1
ld a(1),a_1
umul a_0,a_1,t_1 !=!sqr_add_c2(a,1,0,c2,c3,c1);
addcc c_2,t_1,c_2
rd %y,t_2
addxcc %g0,t_2,c_3
addx %g0,%g0,c_1 !=
ld a_2_,a_2
ld a(2),a_2
addcc c_2,t_1,c_2
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1 !=
st c_2,[%i0+4] !r[1]=c2;
st c_2,r(1) !r[1]=c2;
umul a_2,a_0,t_1 !sqr_add_c2(a,2,0,c3,c1,c2);
addcc c_3,t_1,c_3
@ -1427,12 +1417,12 @@ bn_sqr_comba4:
addcc c_3,t_1,c_3
addxcc c_1,t_2,c_1 !=
addx c_2,%g0,c_2
ld a_3_,a_3
ld a(3),a_3
umul a_1,a_1,t_1 !sqr_add_c(a,1,c3,c1,c2);
addcc c_3,t_1,c_3 !=
rd %y,t_2
addxcc c_1,t_2,c_1
st c_3,[%i0+8] !r[2]=c3;
st c_3,r(2) !r[2]=c3;
addx c_2,%g0,c_2 !=
umul a_0,a_3,t_1 !sqr_add_c2(a,3,0,c1,c2,c3);
@ -1451,7 +1441,7 @@ bn_sqr_comba4:
addcc c_1,t_1,c_1
addxcc c_2,t_2,c_2
addx c_3,%g0,c_3 !=
st c_1,[%i0+12] !r[3]=c1;
st c_1,r(3) !r[3]=c1;
umul a_3,a_1,t_1 !sqr_add_c2(a,3,1,c2,c3,c1);
addcc c_2,t_1,c_2
@ -1466,7 +1456,7 @@ bn_sqr_comba4:
rd %y,t_2 !=
addxcc c_3,t_2,c_3
addx c_1,%g0,c_1
st c_2,[%i0+16] !r[4]=c2;
st c_2,r(4) !r[4]=c2;
umul a_2,a_3,t_1 !=!sqr_add_c2(a,3,2,c3,c1,c2);
addcc c_3,t_1,c_3
@ -1475,20 +1465,20 @@ bn_sqr_comba4:
addx %g0,%g0,c_2 !=
addcc c_3,t_1,c_3
addxcc c_1,t_2,c_1
st c_3,[%i0+20] !r[5]=c3;
st c_3,r(5) !r[5]=c3;
addx c_2,%g0,c_2 !=
umul a_3,a_3,t_1 !sqr_add_c(a,3,c1,c2,c3);
addcc c_1,t_1,c_1
rd %y,t_2
addxcc c_2,t_2,c_2 !=
st c_1,[%i0+24] !r[6]=c1;
st c_2,[%i0+28] !r[7]=c2;
st c_1,r(6) !r[6]=c1;
st c_2,r(7) !r[7]=c2;
ret
restore %g0,%g0,%o0
.type bn_sqr_comba4,#function
.size bn_sqr_comba4,(.-bn_sqr_comba4)
.align 32
.align 32

1569
crypto/bn/asm/sparcv8plus.S Normal file

File diff suppressed because it is too large Load Diff