From 0bd8d6e2e1b9e534167a021340be4f815bfc5fb1 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 10 May 2007 06:48:28 +0000 Subject: [PATCH] Commentary updates to SHA for sparcv9. --- crypto/sha/asm/sha1-sparcv9.pl | 16 +++++++++------- crypto/sha/asm/sha512-sparcv9.pl | 11 ++++++++++- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/crypto/sha/asm/sha1-sparcv9.pl b/crypto/sha/asm/sha1-sparcv9.pl index 9f2d159514..8306fc88cc 100644 --- a/crypto/sha/asm/sha1-sparcv9.pl +++ b/crypto/sha/asm/sha1-sparcv9.pl @@ -8,13 +8,15 @@ # ==================================================================== # Performance improvement is not really impressive on pre-T1 CPU: +8% -# over Sun C and +25% over gcc [3.3]. While on T1, ... And there -# is a gimmick. X[16] vector is packed to 8 64-bit registers and as -# result nothing is spilled on stack. In addition input data is loaded -# in compact instruction sequence, thus minimizing the window when the -# code is subject to [inter-thread] cache-thrashing hazard. The goal -# is to ensure scalability on UltraSPARC T1, or rather to avoid decay -# when amount of active threads exceeds the number of physical cores. +# over Sun C and +25% over gcc [3.3]. While on T1, a.k.a. Niagara, it +# turned out to be 40% faster than 64-bit code generated by Sun C 5.8 and +# >2x faster than 64-bit code generated by gcc 3.4. And there is a gimmick. +# X[16] vector is packed to 8 64-bit registers and as result nothing +# is spilled on stack. In addition input data is loaded in compact +# instruction sequence, thus minimizing the window when the code is +# subject to [inter-thread] cache-thrashing hazard. The goal is to +# ensure scalability on UltraSPARC T1, or rather to avoid decay when +# amount of active threads exceeds the number of physical cores. 
$bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl index bd9afcb115..25f80390ac 100644 --- a/crypto/sha/asm/sha512-sparcv9.pl +++ b/crypto/sha/asm/sha512-sparcv9.pl @@ -23,7 +23,16 @@ # # SHA512 on UltraSPARC T1. # -# ... +# It's not any faster than 64-bit code generated by Sun C 5.8. This is +# because 64-bit code generator has the advantage of using 64-bit +# loads to access X[16], which I consciously traded for 32-/64-bit ABI +# duality [as per above]. But it surpasses 32-bit Sun C generated code +# by 60%, not to mention that it doesn't suffer from severe decay when +# running 4x as many threads as physical cores and that it leaves gcc [3.4] +# behind by a factor of over 4x! If compared to SHA256, single thread +# performance is only 10% better, but overall throughput for maximum +# amount of threads for given CPU exceeds corresponding one of SHA256 +# by 30% [again, optimal coefficient is 50%]. $bits=32; for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }