RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's

apparently impossible to compose blended code with would perform
satisfactory on all x86 and x86_64 cores, an extra RC4_CHAR
code-path is introduced and P4 core is detected at run-time. This
way we keep original performance on non-P4 implementations and
turbo-charge P4 performance by factor of 2.8x (on 32-bit core).
This commit is contained in:
Andy Polyakov 2004-11-21 10:36:25 +00:00
parent 00dd8f6d6e
commit 376729e130
9 changed files with 242 additions and 33 deletions

View File

@ -318,7 +318,7 @@ my %table=(
"linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL::asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"linux-parisc", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::",
#### SPARC Linux setups

4
TABLE
View File

@ -2086,7 +2086,7 @@ $unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
$lflags = -rdynamic -ldl
$bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
$cpuid_obj = x86cpuid-elf.o
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
@ -3572,7 +3572,7 @@ $thread_cflag = -D_REENTRANT
$sys_id =
$lflags = -ldl
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL
$cpuid_obj =
$cpuid_obj = amd64cpuid.o
$bn_obj = asm/x86_64-gcc.o
$des_obj =
$aes_obj =

View File

@ -5,16 +5,63 @@ $win64a=1 if ($output =~ /win64a\.[s|asm]/);
open STDOUT,">$output" || die "can't open $output: $!";
print<<___ if(defined($win64a));
TEXT SEGMENT
_TEXT SEGMENT
PUBLIC OPENSSL_rdtsc
ALIGN 16
OPENSSL_rdtsc PROC NEAR
OPENSSL_rdtsc PROC
rdtsc
shl rdx,32
or rax,rdx
ret
OPENSSL_rdtsc ENDP
TEXT ENDS
PUBLIC OPENSSL_atomic_add
ALIGN 16
OPENSSL_atomic_add PROC
mov eax,DWORD PTR[rcx]
\$Lspin: lea r8,DWORD PTR[rdx+rax]
lock cmpxchg DWORD PTR[rcx],r8d
jne \$Lspin
mov eax,r8d
cdqe
ret
OPENSSL_atomic_add ENDP
PUBLIC OPENSSL_wipe_cpu
ALIGN 16
OPENSSL_wipe_cpu PROC
pxor xmm0,xmm0
pxor xmm1,xmm1
pxor xmm2,xmm2
pxor xmm3,xmm3
pxor xmm4,xmm4
pxor xmm5,xmm5
xor rcx,rcx
xor rdx,rdx
xor r8,r8
xor r9,r9
xor r10,r10
xor r11,r11
lea rax,QWORD PTR[rsp+8]
ret
OPENSSL_wipe_cpu ENDP
OPENSSL_ia32_cpuid PROC
mov r8,rbx
mov eax,1
cpuid
shl rcx,32
mov eax,edx
mov rbx,r8
or rax,rcx
ret
OPENSSL_ia32_cpuid ENDP
_TEXT ENDS
CRT\$XIU SEGMENT
EXTRN OPENSSL_cpuid_setup:PROC
DQ OPENSSL_cpuid_setup
CRT\$XIU ENDS
END
___
print<<___ if(!defined($win64a));
@ -27,4 +74,66 @@ OPENSSL_rdtsc:
or %rdx,%rax
ret
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
.globl OPENSSL_atomic_add
.type OPENSSL_atomic_add,\@function
.align 16
OPENSSL_atomic_add:
movl (%rdi),%eax
.Lspin: lea (%rsi,%rax),%r8
lock; cmpxchg %r8d,(%rdi)
jne .Lspin
mov %r8d,%eax
cdqe
ret
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
.globl OPENSSL_wipe_cpu
.type OPENSSL_wipe_cpu,\@function
.align 16
OPENSSL_wipe_cpu:
pxor %xmm0,%xmm0
pxor %xmm1,%xmm1
pxor %xmm2,%xmm2
pxor %xmm3,%xmm3
pxor %xmm4,%xmm4
pxor %xmm5,%xmm5
pxor %xmm6,%xmm6
pxor %xmm7,%xmm7
pxor %xmm8,%xmm8
pxor %xmm9,%xmm9
pxor %xmm10,%xmm10
pxor %xmm11,%xmm11
pxor %xmm12,%xmm12
pxor %xmm13,%xmm13
pxor %xmm14,%xmm14
pxor %xmm15,%xmm15
xor %rcx,%rcx
xor %rdx,%rdx
xor %rsi,%rsi
xor %rdi,%rdi
xor %r8,%r8
xor %r9,%r9
xor %r10,%r10
xor %r11,%r11
lea 8(%rsp),%rax
ret
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
.globl OPENSSL_ia32_cpuid
.align 16
OPENSSL_ia32_cpuid:
mov %rbx,%r8
mov \$1,%eax
cpuid
shl \$32,%rcx
mov %edx,%eax
mov %r8,%rbx
or %rcx,%rax
ret
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
.section .init
call OPENSSL_cpuid_setup
.align 16
___

View File

@ -161,7 +161,7 @@ sub main'shl { &out2("sall",@_); }
sub main'shr { &out2("shrl",@_); }
sub main'xor { &out2("xorl",@_); }
sub main'xorb { &out2("xorb",@_); }
sub main'add { &out2("addl",@_); }
sub main'add { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); }
sub main'adc { &out2("adcl",@_); }
sub main'sub { &out2("subl",@_); }
sub main'sbb { &out2("sbbl",@_); }
@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); }
sub main'jnc { &out1("jnc",@_); }
sub main'jno { &out1("jno",@_); }
sub main'dec { &out1("decl",@_); }
sub main'inc { &out1("incl",@_); }
sub main'inc { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); }
sub main'push { &out1("pushl",@_); $stack+=4; }
sub main'pop { &out1("popl",@_); $stack-=4; }
sub main'pushf { &out0("pushfl"); $stack+=4; }
@ -205,9 +205,10 @@ sub main'nop { &out0("nop"); }
sub main'test { &out2("testl",@_); }
sub main'bt { &out2("btl",@_); }
sub main'leave { &out0("leave"); }
sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); }
sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); }
sub main'cpuid { &out0(".byte\t0x0f,0xa2"); }
sub main'rdtsc { &out0(".byte\t0x0f,0x31"); }
sub main'halt { &out0("hlt"); }
sub main'movz { &out2("movzb",@_); }
# SSE2
sub main'emms { &out0("emms"); }
@ -558,7 +559,7 @@ sub main'file_end
pushl %ebx
movl %edx,%edi
movl \$1,%eax
.byte 0x0f; .byte 0xa2
.byte 0x0f,0xa2
orl \$1<<10,%edx
movl %edx,0(%edi)
popl %ebx

View File

@ -7,10 +7,10 @@ require "x86asm.pl";
&asm_init($ARGV[0],"rc4-586.pl");
$tx="eax";
$ty="ebx";
$x="ecx";
$y="edx";
$x="eax";
$y="ebx";
$tx="ecx";
$ty="edx";
$in="esi";
$out="edi";
$d="ebp";
@ -31,7 +31,7 @@ sub RC4_loop
{
&mov($ty, &swtmp(2));
&cmp($ty, $in);
&jle(&label("finished"));
&jbe(&label("finished"));
&inc($in);
}
else
@ -39,7 +39,7 @@ sub RC4_loop
&add($ty, 8);
&inc($in);
&cmp($ty, $in);
&jl(&label("finished"));
&jb(&label("finished"));
&mov(&swtmp(2), $ty);
}
}
@ -88,35 +88,44 @@ sub RC4
&function_begin_B($name,"");
&mov($ty,&wparam(1)); # len
&cmp($ty,0);
&jne(&label("proceed"));
&ret();
&set_label("proceed");
&comment("");
&push("ebp");
&push("ebx");
&mov( $d, &wparam(0)); # key
&mov( $ty, &wparam(1)); # num
&push("esi");
&push("edi");
&mov( $d, &wparam(0)); # key
&mov( $in, &wparam(2));
&mov( $x, &DWP(0,$d,"",1));
&mov( $y, &DWP(4,$d,"",1));
&mov( $in, &wparam(2));
&mov( $out, &wparam(3));
&inc( $x);
&stack_push(3); # 3 temp variables
&add( $d, 8);
&and( $x, 0xff);
# detect compressed schedule, see commentary section in rc4_skey.c...
&cmp(&DWP(256,$d),-1);
&je(&label("RC4_CHAR"));
&lea( $ty, &DWP(-8,$ty,$in));
# check for 0 length input
&mov( $out, &wparam(3));
&mov( &swtmp(2), $ty); # this is now address to exit at
&mov( $tx, &DWP(0,$d,$x,4));
&cmp( $ty, $in);
&jl( &label("end")); # less than 8 bytes
&jb( &label("end")); # less than 8 bytes
&set_label("start");
@ -148,7 +157,7 @@ sub RC4
&mov( &DWP(-4,$out,"",0), $tx);
&mov( $tx, &DWP(0,$d,$x,4));
&cmp($in, $ty);
&jle(&label("start"));
&jbe(&label("start"));
&set_label("end");
@ -162,6 +171,32 @@ sub RC4
&RC4_loop(5,0,1);
&RC4_loop(6,1,1);
&jmp(&label("finished"));
&align(16);
# this is essentially Intel P4 specific codepath, see rc4_skey.c...
&set_label("RC4_CHAR");
&lea ($ty,&DWP(0,$in,$ty));
&mov (&swtmp(2),$ty);
# strangely enough unrolled loop performs over 20% slower...
&set_label("RC4_CHAR_loop");
&movz ($tx,&BP(0,$d,$x));
&add (&LB($y),&LB($tx));
&movz ($ty,&BP(0,$d,$y));
&movb (&BP(0,$d,$y),&LB($tx));
&movb (&BP(0,$d,$x),&LB($ty));
&add (&LB($ty),&LB($tx));
&movz ($ty,&BP(0,$d,$ty));
&xorb (&LB($ty),&BP(0,$in));
&movb (&BP(0,$out),&LB($ty));
&inc (&LB($x));
&inc ($in);
&inc ($out);
&cmp ($in,&swtmp(2));
&jb (&label("RC4_CHAR_loop"));
&set_label("finished");
&dec( $x);
&stack_pop(3);

View File

@ -25,6 +25,13 @@
# Latter means that if you want to *estimate* what to expect from
# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 is to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code, which would perform even within 30% marginal
# on either AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details...
$output=shift;
$win64a=1 if ($output =~ /win64a.[s|asm]/);
@ -90,6 +97,8 @@ $code.=<<___;
add \$8,$dat
movl `&PTR("DWORD:-8[$dat]")`,$XX#d
movl `&PTR("DWORD:-4[$dat]")`,$YY#d
cmpl \$-1,`&PTR("DWORD:256[$dat]")`
je .LRC4_CHAR
test \$-8,$len
jz .Lloop1
.align 16
@ -167,6 +176,24 @@ $code.=<<___;
dec $len
jnz .Lloop1
jmp .Lexit
.align 16
.LRC4_CHAR:
inc $XX#b
movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d
add $TX#b,$YY#b
movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d
movb $TX#b,`&PTR("BYTE:[$dat+$YY]")`
movb $TY#b,`&PTR("BYTE:[$dat+$XX]")`
add $TX#b,$TY#b
movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d
xorb `&PTR("BYTE:[$inp]")`,$TY#b
movb $TY#b,`&PTR("BYTE:[$out]")`
inc $inp
inc $out
dec $len
jnz .LRC4_CHAR
jmp .Lexit
___
$code.=<<___ if (defined($win64a));
RC4 ENDP
@ -189,6 +216,8 @@ if (defined($win64a)) {
$code =~ s/mov[bwlq]/mov/gm;
$code =~ s/movzb/movzx/gm;
$code =~ s/repret/DB\t0F3h,0C3h/gm;
$code =~ s/cmpl/cmp/gm;
$code =~ s/xorb/xor/gm;
} else {
$code =~ s/([QD]*WORD|BYTE)://gm;
$code =~ s/repret/.byte\t0xF3,0xC3/gm;

View File

@ -1,4 +1,5 @@
#ifndef HEADER_RC4_LOCL_H
#define HEADER_RC4_LOCL_H
#include <openssl/opensslconf.h>
#include <cryptlib.h>
#endif

View File

@ -93,25 +93,58 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
unsigned int i;
d= &(key->data[0]);
for (i=0; i<256; i++)
d[i]=i;
key->x = 0;
key->y = 0;
id1=id2=0;
#define SK_LOOP(n) { \
#define SK_LOOP(d,n) { \
tmp=d[(n)]; \
id2 = (data[id1] + tmp + id2) & 0xff; \
if (++id1 == len) id1=0; \
d[(n)]=d[id2]; \
d[id2]=tmp; }
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
defined(__INTEL__) || \
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
if (sizeof(RC4_INT) > 1) {
/*
* Unlike all other x86 [and x86_64] implementations,
* Intel P4 core [including EM64T] was found to perform
* poorly with wider RC4_INT. Performance improvement
* for IA-32 hand-coded assembler turned out to be 2.8x
* if re-coded for RC4_CHAR! It's however inappropriate
* to just switch to RC4_CHAR for x86[_64], as non-P4
* implementations suffer from significant performance
* losses then, e.g. PIII exhibits >2x deterioration,
* and so does Opteron. In order to assure optimal
* all-round performance, let us [try to] detect P4 at
* run-time by checking upon HTT bit in CPU capability
* vector and set up compressed key schedule, which is
* recognized by correspondingly updated assembler
* module...
* <appro@fy.chalmers.se>
*/
if (OPENSSL_ia32cap_P & (1<<28)) {
unsigned char *cp=(unsigned char *)d;
for (i=0;i<256;i++) cp[i]=i;
for (i=0;i<256;i++) SK_LOOP(cp,i);
/* mark schedule as compressed! */
d[256/sizeof(RC4_INT)]=-1;
return;
}
}
# endif
#endif
for (i=0; i < 256; i++) d[i]=i;
for (i=0; i < 256; i+=4)
{
SK_LOOP(i+0);
SK_LOOP(i+1);
SK_LOOP(i+2);
SK_LOOP(i+3);
SK_LOOP(d,i+0);
SK_LOOP(d,i+1);
SK_LOOP(d,i+2);
SK_LOOP(d,i+3);
}
}

View File

@ -14,11 +14,12 @@ OPENSSL_ia32cap
Value returned by OPENSSL_ia32cap_loc() is address of a variable
containing IA-32 processor capabilities bit vector as it appears in EDX
register after executing CPUID instruction with EAX=1 input value (see
Intel Application Note #241618). Naturally it's meaningful on IA-32
Intel Application Note #241618). Naturally it's meaningful on IA-32[E]
platforms only. The variable is normally set up automatically upon
toolkit initialization, but can be manipulated afterwards to modify
crypto library behaviour. For the moment of this writing only two bits
are significant, namely bit #26 denoting SSE2 support, and bit #4
crypto library behaviour. For the moment of this writing three bits are
significant, namely bit #28 denoting Hyperthreading, which is used to
distinguish Intel P4 core, bit #26 denoting SSE2 support, and bit #4
denoting presence of Time-Stamp Counter. Clearing bit #26 at run-time
for example disables high-performance SSE2 code present in the crypto
library. You might have to do this if target OpenSSL application is