From fe0686483621d420705e881cd9187788a0691583 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 13 Nov 2011 20:33:41 +0000 Subject: [PATCH] bsaes-x86_64.pl: add Win64 SEH and "hardware" calls to aes-x86_64.pl. --- crypto/aes/asm/aes-x86_64.pl | 9 ++ crypto/aes/asm/bsaes-x86_64.pl | 233 +++++++++++++++++++++++++++++---- 2 files changed, 216 insertions(+), 26 deletions(-) diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl index 674a68c43e..1be1266762 100755 --- a/crypto/aes/asm/aes-x86_64.pl +++ b/crypto/aes/asm/aes-x86_64.pl @@ -588,6 +588,9 @@ $code.=<<___; .globl AES_encrypt .type AES_encrypt,\@function,3 .align 16 +.globl asm_AES_encrypt +.hidden asm_AES_encrypt +asm_AES_encrypt: AES_encrypt: push %rbx push %rbp @@ -1184,6 +1187,9 @@ $code.=<<___; .globl AES_decrypt .type AES_decrypt,\@function,3 .align 16 +.globl asm_AES_decrypt +.hidden asm_AES_decrypt +asm_AES_decrypt: AES_decrypt: push %rbx push %rbp @@ -1644,6 +1650,9 @@ $code.=<<___; .type AES_cbc_encrypt,\@function,6 .align 16 .extern OPENSSL_ia32cap_P +.globl asm_AES_cbc_encrypt +.hidden asm_AES_cbc_encrypt +asm_AES_cbc_encrypt: AES_cbc_encrypt: cmp \$0,%rdx # check length je .Lcbc_epilogue diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl index 89853d28b3..edc70fa1be 100644 --- a/crypto/aes/asm/bsaes-x86_64.pl +++ b/crypto/aes/asm/bsaes-x86_64.pl @@ -88,8 +88,8 @@ # # November 2011. # -# Add bsaes_xts_[en|de]crypt. Small-block performance is suboptimal, -# but XTS is meant to be used with larger blocks... +# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is +# suboptimal, but XTS is meant to be used with larger blocks... # # @@ -108,6 +108,7 @@ open STDOUT,"| $^X $xlate $flavour $output"; my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) +my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... 
{ my ($key,$rounds,$const)=("%rax","%r10d","%r11"); @@ -743,8 +744,8 @@ ___ $code.=<<___; .text -.extern AES_encrypt -.extern AES_decrypt +.extern asm_AES_encrypt +.extern asm_AES_decrypt .type _bsaes_encrypt8,\@abi-omnipotent .align 64 @@ -950,7 +951,7 @@ $code.=<<___; ___ } -if (1 && !$win64) { # following four functions are unsupported interface +if (0 && !$win64) { # following four functions are unsupported interface # used for benchmarking... $code.=<<___; .globl bsaes_enc_key_convert @@ -1056,12 +1057,14 @@ my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); -if (0) { # suppress unreferenced ECB subroutines, spare some space... +if ($ecb) { $code.=<<___; .globl bsaes_ecb_encrypt_blocks .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent .align 16 bsaes_ecb_encrypt_blocks: + mov %rsp, %rax +.Lecb_enc_prologue: push %rbp push %rbx push %r12 @@ -1213,7 +1216,7 @@ $code.=<<___; lea ($inp), $arg1 lea ($out), $arg2 lea ($key), $arg3 - call AES_encrypt + call asm_AES_encrypt lea 16($inp), $inp lea 16($out), $out dec $len @@ -1250,8 +1253,9 @@ $code.=<<___; mov 0x58(%rsp), %r13 mov 0x60(%rsp), %r12 mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp + mov 0x70(%rsp), %rax lea 0x78(%rsp), %rsp + mov %rax, %rbp .Lecb_enc_epilogue: ret .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks @@ -1260,6 +1264,8 @@ $code.=<<___; .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent .align 16 bsaes_ecb_decrypt_blocks: + mov %rsp, %rax +.Lecb_dec_prologue: push %rbp push %rbx push %r12 @@ -1412,7 +1418,7 @@ $code.=<<___; lea ($inp), $arg1 lea ($out), $arg2 lea ($key), $arg3 - call AES_decrypt + call asm_AES_decrypt lea 16($inp), $inp lea 16($out), $out dec $len @@ -1449,15 +1455,16 @@ $code.=<<___; mov 0x58(%rsp), %r13 mov 0x60(%rsp), %r12 mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp + mov 0x70(%rsp), %rax lea 0x78(%rsp), %rsp + mov %rax, %rbp .Lecb_dec_epilogue: 
ret .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks ___ } $code.=<<___; -.extern AES_cbc_encrypt +.extern asm_AES_cbc_encrypt .globl bsaes_cbc_encrypt .type bsaes_cbc_encrypt,\@abi-omnipotent .align 16 @@ -1468,10 +1475,12 @@ $code.=<<___ if ($win64); ___ $code.=<<___; cmp \$0,$arg6 - jne AES_cbc_encrypt + jne asm_AES_cbc_encrypt cmp \$128,$arg3 - jb AES_cbc_encrypt + jb asm_AES_cbc_encrypt + mov %rsp, %rax +.Lcbc_dec_prologue: push %rbp push %rbx push %r12 @@ -1699,7 +1708,7 @@ $code.=<<___; lea ($inp), $arg1 lea 0x20(%rbp), $arg2 # buffer output lea ($key), $arg3 - call AES_decrypt # doesn't touch %xmm + call asm_AES_decrypt # doesn't touch %xmm pxor 0x20(%rbp), @XMM[15] # ^= IV movdqu @XMM[15], ($out) # write output movdqa @XMM[0], @XMM[15] # IV @@ -1736,8 +1745,9 @@ $code.=<<___; mov 0x58(%rsp), %r13 mov 0x60(%rsp), %r12 mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp + mov 0x70(%rsp), %rax lea 0x78(%rsp), %rsp + mov %rax, %rbp .Lcbc_dec_epilogue: ret .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt @@ -1746,6 +1756,8 @@ $code.=<<___; .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent .align 16 bsaes_ctr32_encrypt_blocks: + mov %rsp, %rax +.Lctr_enc_prologue: push %rbp push %rbx push %r12 @@ -1919,7 +1931,7 @@ $code.=<<___; lea 0x20(%rbp), $arg1 lea 0x30(%rbp), $arg2 lea ($key), $arg3 - call AES_encrypt + call asm_AES_encrypt movdqu ($inp), @XMM[1] lea 16($inp), $inp mov 0x2c(%rbp), %eax # load 32-bit counter @@ -1964,8 +1976,9 @@ $code.=<<___; mov 0x58(%rsp), %r13 mov 0x60(%rsp), %r12 mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp + mov 0x70(%rsp), %rax lea 0x78(%rsp), %rsp + mov %rax, %rbp .Lctr_enc_epilogue: ret .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks @@ -1981,6 +1994,8 @@ $code.=<<___; .type bsaes_xts_encrypt,\@abi-omnipotent .align 16 bsaes_xts_encrypt: + mov %rsp, %rax +.Lxts_enc_prologue: push %rbp push %rbx push %r12 @@ -2015,7 +2030,7 @@ $code.=<<___; lea ($arg6), $arg1 lea 0x20(%rbp), $arg2 lea ($arg5), $arg3 - call AES_encrypt # 
generate initial tweak + call asm_AES_encrypt # generate initial tweak mov 240($key), %eax # rounds mov $len, %rbx # backup $len @@ -2281,7 +2296,7 @@ $code.=<<___; lea 0x20(%rbp), $arg1 lea 0x20(%rbp), $arg2 lea ($key), $arg3 - call AES_encrypt # doesn't touch %xmm + call asm_AES_encrypt # doesn't touch %xmm pxor 0x20(%rbp), @XMM[0] # ^= tweak[] #pxor @XMM[8], @XMM[0] #lea 0x80(%rsp), %rax # pass key schedule @@ -2314,7 +2329,7 @@ $code.=<<___; lea 0x20(%rbp), $arg2 movdqa @XMM[0], 0x20(%rbp) lea ($key), $arg3 - call AES_encrypt # doesn't touch %xmm + call asm_AES_encrypt # doesn't touch %xmm pxor 0x20(%rbp), @XMM[7] movdqu @XMM[7], -16($out) @@ -2349,8 +2364,9 @@ $code.=<<___; mov 0x58(%rsp), %r13 mov 0x60(%rsp), %r12 mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp + mov 0x70(%rsp), %rax lea 0x78(%rsp), %rsp + mov %rax, %rbp .Lxts_enc_epilogue: ret .size bsaes_xts_encrypt,.-bsaes_xts_encrypt @@ -2359,6 +2375,8 @@ $code.=<<___; .type bsaes_xts_decrypt,\@abi-omnipotent .align 16 bsaes_xts_decrypt: + mov %rsp, %rax +.Lxts_dec_prologue: push %rbp push %rbx push %r12 @@ -2393,7 +2411,7 @@ $code.=<<___; lea ($arg6), $arg1 lea 0x20(%rbp), $arg2 lea ($arg5), $arg3 - call AES_encrypt # generate initial tweak + call asm_AES_encrypt # generate initial tweak mov 240($key), %eax # rounds mov $len, %rbx # backup $len @@ -2666,7 +2684,7 @@ $code.=<<___; lea 0x20(%rbp), $arg1 lea 0x20(%rbp), $arg2 lea ($key), $arg3 - call AES_decrypt # doesn't touch %xmm + call asm_AES_decrypt # doesn't touch %xmm pxor 0x20(%rbp), @XMM[0] # ^= tweak[] #pxor @XMM[8], @XMM[0] #lea 0x80(%rsp), %rax # pass key schedule @@ -2697,7 +2715,7 @@ $code.=<<___; lea 0x20(%rbp), $arg2 movdqa @XMM[0], 0x20(%rbp) lea ($key), $arg3 - call AES_decrypt # doesn't touch %xmm + call asm_AES_decrypt # doesn't touch %xmm pxor 0x20(%rbp), @XMM[7] mov $out, %rdx movdqu @XMM[7], ($out) @@ -2718,7 +2736,7 @@ $code.=<<___; lea 0x20(%rbp), $arg2 movdqa @XMM[0], 0x20(%rbp) lea ($key), $arg3 - call AES_decrypt # doesn't touch 
%xmm + call asm_AES_decrypt # doesn't touch %xmm pxor 0x20(%rbp), @XMM[6] movdqu @XMM[6], ($out) @@ -2753,8 +2771,9 @@ $code.=<<___; mov 0x58(%rsp), %r13 mov 0x60(%rsp), %r12 mov 0x68(%rsp), %rbx - mov 0x70(%rsp), %rbp + mov 0x70(%rsp), %rax lea 0x78(%rsp), %rsp + mov %rax, %rbp .Lxts_dec_epilogue: ret .size bsaes_xts_decrypt,.-bsaes_xts_decrypt @@ -2815,6 +2834,168 @@ _bsaes_const: .size _bsaes_const,.-_bsaes_const ___ +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov 160($context),%rax # pull context->Rbp + + lea 0x40(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xa0(%rax),%rax # adjust stack pointer + + mov 0x70(%rax),%rbp + mov 0x68(%rax),%rbx + mov 0x60(%rax),%r12 + mov 0x58(%rax),%r13 + mov 0x50(%rax),%r14 + mov 0x48(%rax),%r15 + lea 0x78(%rax),%rax # adjust stack pointer + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore 
context->R15 + +.Lin_prologue: + mov %rax,152($context) # restore context->Rsp + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 +___ +$code.=<<___ if ($ecb); + .rva .Lecb_enc_prologue + .rva .Lecb_enc_epilogue + .rva .Lecb_enc_info + + .rva .Lecb_dec_prologue + .rva .Lecb_dec_epilogue + .rva .Lecb_dec_info +___ +$code.=<<___; + .rva .Lcbc_dec_prologue + .rva .Lcbc_dec_epilogue + .rva .Lcbc_dec_info + + .rva .Lctr_enc_prologue + .rva .Lctr_enc_epilogue + .rva .Lctr_enc_info + + .rva .Lxts_enc_prologue + .rva .Lxts_enc_epilogue + .rva .Lxts_enc_info + + .rva .Lxts_dec_prologue + .rva .Lxts_dec_epilogue + .rva .Lxts_dec_info + +.section .xdata +.align 8 +___ +$code.=<<___ if ($ecb); +.Lecb_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] +.Lecb_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] +___ +$code.=<<___; +.Lcbc_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] +.Lctr_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] +.Lxts_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva 
.Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] +.Lxts_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] +___ +} + $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code;