mirror of
https://github.com/openssl/openssl.git
synced 2025-02-17 14:32:04 +08:00
ARM64 assembly pack: make it Windows-friendly.
"Windows friendliness" means a) PIC-ification unified across all platforms; b) a unified comment delimiter; c) explicit ldur/stur, as the Visual Studio assembler can't automatically encode ldr/str as ldur/stur when needed. Reviewed-by: Paul Dale <paul.dale@oracle.com> Reviewed-by: Richard Levitte <levitte@openssl.org> (Merged from https://github.com/openssl/openssl/pull/8256)
This commit is contained in:
parent
3405db97e5
commit
db42bb440e
@ -150,12 +150,12 @@ my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
|
||||
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
|
||||
|
||||
$code.=<<___;
|
||||
##
|
||||
## _aes_preheat
|
||||
##
|
||||
## Fills register %r10 -> .aes_consts (so you can -fPIC)
|
||||
## and %xmm9-%xmm15 as specified below.
|
||||
##
|
||||
//
|
||||
// _aes_preheat
|
||||
//
|
||||
// Fills register %r10 -> .aes_consts (so you can -fPIC)
|
||||
// and %xmm9-%xmm15 as specified below.
|
||||
//
|
||||
.type _vpaes_encrypt_preheat,%function
|
||||
.align 4
|
||||
_vpaes_encrypt_preheat:
|
||||
@ -167,21 +167,21 @@ _vpaes_encrypt_preheat:
|
||||
ret
|
||||
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
|
||||
|
||||
##
|
||||
## _aes_encrypt_core
|
||||
##
|
||||
## AES-encrypt %xmm0.
|
||||
##
|
||||
## Inputs:
|
||||
## %xmm0 = input
|
||||
## %xmm9-%xmm15 as in _vpaes_preheat
|
||||
## (%rdx) = scheduled keys
|
||||
##
|
||||
## Output in %xmm0
|
||||
## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
|
||||
## Preserves %xmm6 - %xmm8 so you get some local vectors
|
||||
##
|
||||
##
|
||||
//
|
||||
// _aes_encrypt_core
|
||||
//
|
||||
// AES-encrypt %xmm0.
|
||||
//
|
||||
// Inputs:
|
||||
// %xmm0 = input
|
||||
// %xmm9-%xmm15 as in _vpaes_preheat
|
||||
// (%rdx) = scheduled keys
|
||||
//
|
||||
// Output in %xmm0
|
||||
// Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
|
||||
// Preserves %xmm6 - %xmm8 so you get some local vectors
|
||||
//
|
||||
//
|
||||
.type _vpaes_encrypt_core,%function
|
||||
.align 4
|
||||
_vpaes_encrypt_core:
|
||||
@ -387,11 +387,11 @@ _vpaes_decrypt_preheat:
|
||||
ret
|
||||
.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
|
||||
|
||||
##
|
||||
## Decryption core
|
||||
##
|
||||
## Same API as encryption core.
|
||||
##
|
||||
//
|
||||
// Decryption core
|
||||
//
|
||||
// Same API as encryption core.
|
||||
//
|
||||
.type _vpaes_decrypt_core,%function
|
||||
.align 4
|
||||
_vpaes_decrypt_core:
|
||||
@ -643,11 +643,11 @@ my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
|
||||
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
|
||||
|
||||
$code.=<<___;
|
||||
########################################################
|
||||
## ##
|
||||
## AES key schedule ##
|
||||
## ##
|
||||
########################################################
|
||||
////////////////////////////////////////////////////////
|
||||
// //
|
||||
// AES key schedule //
|
||||
// //
|
||||
////////////////////////////////////////////////////////
|
||||
.type _vpaes_key_preheat,%function
|
||||
.align 4
|
||||
_vpaes_key_preheat:
|
||||
@ -703,14 +703,14 @@ _vpaes_schedule_core:
|
||||
b.eq .Lschedule_192
|
||||
// 128: fall through
|
||||
|
||||
##
|
||||
## .schedule_128
|
||||
##
|
||||
## 128-bit specific part of key schedule.
|
||||
##
|
||||
## This schedule is really simple, because all its parts
|
||||
## are accomplished by the subroutines.
|
||||
##
|
||||
//
|
||||
// .schedule_128
|
||||
//
|
||||
// 128-bit specific part of key schedule.
|
||||
//
|
||||
// This schedule is really simple, because all its parts
|
||||
// are accomplished by the subroutines.
|
||||
//
|
||||
.Lschedule_128:
|
||||
mov $inp, #10 // mov \$10, %esi
|
||||
|
||||
@ -721,21 +721,21 @@ _vpaes_schedule_core:
|
||||
bl _vpaes_schedule_mangle // write output
|
||||
b .Loop_schedule_128
|
||||
|
||||
##
|
||||
## .aes_schedule_192
|
||||
##
|
||||
## 192-bit specific part of key schedule.
|
||||
##
|
||||
## The main body of this schedule is the same as the 128-bit
|
||||
## schedule, but with more smearing. The long, high side is
|
||||
## stored in %xmm7 as before, and the short, low side is in
|
||||
## the high bits of %xmm6.
|
||||
##
|
||||
## This schedule is somewhat nastier, however, because each
|
||||
## round produces 192 bits of key material, or 1.5 round keys.
|
||||
## Therefore, on each cycle we do 2 rounds and produce 3 round
|
||||
## keys.
|
||||
##
|
||||
//
|
||||
// .aes_schedule_192
|
||||
//
|
||||
// 192-bit specific part of key schedule.
|
||||
//
|
||||
// The main body of this schedule is the same as the 128-bit
|
||||
// schedule, but with more smearing. The long, high side is
|
||||
// stored in %xmm7 as before, and the short, low side is in
|
||||
// the high bits of %xmm6.
|
||||
//
|
||||
// This schedule is somewhat nastier, however, because each
|
||||
// round produces 192 bits of key material, or 1.5 round keys.
|
||||
// Therefore, on each cycle we do 2 rounds and produce 3 round
|
||||
// keys.
|
||||
//
|
||||
.align 4
|
||||
.Lschedule_192:
|
||||
sub $inp, $inp, #8
|
||||
@ -759,16 +759,16 @@ _vpaes_schedule_core:
|
||||
bl _vpaes_schedule_192_smear
|
||||
b .Loop_schedule_192
|
||||
|
||||
##
|
||||
## .aes_schedule_256
|
||||
##
|
||||
## 256-bit specific part of key schedule.
|
||||
##
|
||||
## The structure here is very similar to the 128-bit
|
||||
## schedule, but with an additional "low side" in
|
||||
## %xmm6. The low side's rounds are the same as the
|
||||
## high side's, except no rcon and no rotation.
|
||||
##
|
||||
//
|
||||
// .aes_schedule_256
|
||||
//
|
||||
// 256-bit specific part of key schedule.
|
||||
//
|
||||
// The structure here is very similar to the 128-bit
|
||||
// schedule, but with an additional "low side" in
|
||||
// %xmm6. The low side's rounds are the same as the
|
||||
// high side's, except no rcon and no rotation.
|
||||
//
|
||||
.align 4
|
||||
.Lschedule_256:
|
||||
ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
|
||||
@ -795,16 +795,16 @@ _vpaes_schedule_core:
|
||||
|
||||
b .Loop_schedule_256
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle_last
|
||||
##
|
||||
## Mangler for last round of key schedule
|
||||
## Mangles %xmm0
|
||||
## when encrypting, outputs out(%xmm0) ^ 63
|
||||
## when decrypting, outputs unskew(%xmm0)
|
||||
##
|
||||
## Always called right before return... jumps to cleanup and exits
|
||||
##
|
||||
//
|
||||
// .aes_schedule_mangle_last
|
||||
//
|
||||
// Mangler for last round of key schedule
|
||||
// Mangles %xmm0
|
||||
// when encrypting, outputs out(%xmm0) ^ 63
|
||||
// when decrypting, outputs unskew(%xmm0)
|
||||
//
|
||||
// Always called right before return... jumps to cleanup and exits
|
||||
//
|
||||
.align 4
|
||||
.Lschedule_mangle_last:
|
||||
// schedule last round key from xmm0
|
||||
@ -838,20 +838,20 @@ _vpaes_schedule_core:
|
||||
ret
|
||||
.size _vpaes_schedule_core,.-_vpaes_schedule_core
|
||||
|
||||
##
|
||||
## .aes_schedule_192_smear
|
||||
##
|
||||
## Smear the short, low side in the 192-bit key schedule.
|
||||
##
|
||||
## Inputs:
|
||||
## %xmm7: high side, b a x y
|
||||
## %xmm6: low side, d c 0 0
|
||||
## %xmm13: 0
|
||||
##
|
||||
## Outputs:
|
||||
## %xmm6: b+c+d b+c 0 0
|
||||
## %xmm0: b+c+d b+c b a
|
||||
##
|
||||
//
|
||||
// .aes_schedule_192_smear
|
||||
//
|
||||
// Smear the short, low side in the 192-bit key schedule.
|
||||
//
|
||||
// Inputs:
|
||||
// %xmm7: high side, b a x y
|
||||
// %xmm6: low side, d c 0 0
|
||||
// %xmm13: 0
|
||||
//
|
||||
// Outputs:
|
||||
// %xmm6: b+c+d b+c 0 0
|
||||
// %xmm0: b+c+d b+c b a
|
||||
//
|
||||
.type _vpaes_schedule_192_smear,%function
|
||||
.align 4
|
||||
_vpaes_schedule_192_smear:
|
||||
@ -867,24 +867,24 @@ _vpaes_schedule_192_smear:
|
||||
ret
|
||||
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
|
||||
|
||||
##
|
||||
## .aes_schedule_round
|
||||
##
|
||||
## Runs one main round of the key schedule on %xmm0, %xmm7
|
||||
##
|
||||
## Specifically, runs subbytes on the high dword of %xmm0
|
||||
## then rotates it by one byte and xors into the low dword of
|
||||
## %xmm7.
|
||||
##
|
||||
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
|
||||
## next rcon.
|
||||
##
|
||||
## Smears the dwords of %xmm7 by xoring the low into the
|
||||
## second low, result into third, result into highest.
|
||||
##
|
||||
## Returns results in %xmm7 = %xmm0.
|
||||
## Clobbers %xmm1-%xmm4, %r11.
|
||||
##
|
||||
//
|
||||
// .aes_schedule_round
|
||||
//
|
||||
// Runs one main round of the key schedule on %xmm0, %xmm7
|
||||
//
|
||||
// Specifically, runs subbytes on the high dword of %xmm0
|
||||
// then rotates it by one byte and xors into the low dword of
|
||||
// %xmm7.
|
||||
//
|
||||
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
|
||||
// next rcon.
|
||||
//
|
||||
// Smears the dwords of %xmm7 by xoring the low into the
|
||||
// second low, result into third, result into highest.
|
||||
//
|
||||
// Returns results in %xmm7 = %xmm0.
|
||||
// Clobbers %xmm1-%xmm4, %r11.
|
||||
//
|
||||
.type _vpaes_schedule_round,%function
|
||||
.align 4
|
||||
_vpaes_schedule_round:
|
||||
@ -932,15 +932,15 @@ _vpaes_schedule_low_round:
|
||||
ret
|
||||
.size _vpaes_schedule_round,.-_vpaes_schedule_round
|
||||
|
||||
##
|
||||
## .aes_schedule_transform
|
||||
##
|
||||
## Linear-transform %xmm0 according to tables at (%r11)
|
||||
##
|
||||
## Requires that %xmm9 = 0x0F0F... as in preheat
|
||||
## Output in %xmm0
|
||||
## Clobbers %xmm1, %xmm2
|
||||
##
|
||||
//
|
||||
// .aes_schedule_transform
|
||||
//
|
||||
// Linear-transform %xmm0 according to tables at (%r11)
|
||||
//
|
||||
// Requires that %xmm9 = 0x0F0F... as in preheat
|
||||
// Output in %xmm0
|
||||
// Clobbers %xmm1, %xmm2
|
||||
//
|
||||
.type _vpaes_schedule_transform,%function
|
||||
.align 4
|
||||
_vpaes_schedule_transform:
|
||||
@ -954,29 +954,29 @@ _vpaes_schedule_transform:
|
||||
ret
|
||||
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
|
||||
|
||||
##
|
||||
## .aes_schedule_mangle
|
||||
##
|
||||
## Mangle xmm0 from (basis-transformed) standard version
|
||||
## to our version.
|
||||
##
|
||||
## On encrypt,
|
||||
## xor with 0x63
|
||||
## multiply by circulant 0,1,1,1
|
||||
## apply shiftrows transform
|
||||
##
|
||||
## On decrypt,
|
||||
## xor with 0x63
|
||||
## multiply by "inverse mixcolumns" circulant E,B,D,9
|
||||
## deskew
|
||||
## apply shiftrows transform
|
||||
##
|
||||
##
|
||||
## Writes out to (%rdx), and increments or decrements it
|
||||
## Keeps track of round number mod 4 in %r8
|
||||
## Preserves xmm0
|
||||
## Clobbers xmm1-xmm5
|
||||
##
|
||||
//
|
||||
// .aes_schedule_mangle
|
||||
//
|
||||
// Mangle xmm0 from (basis-transformed) standard version
|
||||
// to our version.
|
||||
//
|
||||
// On encrypt,
|
||||
// xor with 0x63
|
||||
// multiply by circulant 0,1,1,1
|
||||
// apply shiftrows transform
|
||||
//
|
||||
// On decrypt,
|
||||
// xor with 0x63
|
||||
// multiply by "inverse mixcolumns" circulant E,B,D,9
|
||||
// deskew
|
||||
// apply shiftrows transform
|
||||
//
|
||||
//
|
||||
// Writes out to (%rdx), and increments or decrements it
|
||||
// Keeps track of round number mod 4 in %r8
|
||||
// Preserves xmm0
|
||||
// Clobbers xmm1-xmm5
|
||||
//
|
||||
.type _vpaes_schedule_mangle,%function
|
||||
.align 4
|
||||
_vpaes_schedule_mangle:
|
||||
|
@ -197,7 +197,7 @@ bn_mul_mont:
|
||||
mul $nlo,$nj,$m1 // np[j]*m1
|
||||
adds $lo1,$lo1,$lo0
|
||||
umulh $nhi,$nj,$m1
|
||||
str $lo1,[$tp,#-16] // tp[j-1]
|
||||
stur $lo1,[$tp,#-16] // tp[j-1]
|
||||
cbnz $j,.Linner
|
||||
|
||||
.Linner_skip:
|
||||
@ -253,13 +253,13 @@ bn_mul_mont:
|
||||
csel $nj,$tj,$aj,lo // did it borrow?
|
||||
ldr $tj,[$tp],#8
|
||||
ldr $aj,[$rp],#8
|
||||
str xzr,[$tp,#-16] // wipe tp
|
||||
str $nj,[$rp,#-16]
|
||||
stur xzr,[$tp,#-16] // wipe tp
|
||||
stur $nj,[$rp,#-16]
|
||||
cbnz $num,.Lcond_copy
|
||||
|
||||
csel $nj,$tj,$aj,lo
|
||||
str xzr,[$tp,#-8] // wipe tp
|
||||
str $nj,[$rp,#-8]
|
||||
stur xzr,[$tp,#-8] // wipe tp
|
||||
stur $nj,[$rp,#-8]
|
||||
|
||||
ldp x19,x20,[x29,#16]
|
||||
mov sp,x29
|
||||
@ -596,7 +596,7 @@ __bn_sqr8x_mont:
|
||||
ldp $a4,$a5,[$tp,#8*4]
|
||||
ldp $a6,$a7,[$tp,#8*6]
|
||||
adds $acc0,$acc0,$a0
|
||||
ldr $n0,[$rp,#-8*8]
|
||||
ldur $n0,[$rp,#-8*8]
|
||||
adcs $acc1,$acc1,$a1
|
||||
ldp $a0,$a1,[$ap,#8*0]
|
||||
adcs $acc2,$acc2,$a2
|
||||
@ -794,7 +794,7 @@ $code.=<<___;
|
||||
//adc $carry,xzr,xzr // moved below
|
||||
cbz $cnt,.Lsqr8x8_post_condition
|
||||
|
||||
ldr $n0,[$tp,#-8*8]
|
||||
ldur $n0,[$tp,#-8*8]
|
||||
ldp $a0,$a1,[$np,#8*0]
|
||||
ldp $a2,$a3,[$np,#8*2]
|
||||
ldp $a4,$a5,[$np,#8*4]
|
||||
@ -852,7 +852,7 @@ $code.=<<___;
|
||||
ldp $a6,$a7,[$tp,#8*6]
|
||||
cbz $cnt,.Lsqr8x_tail_break
|
||||
|
||||
ldr $n0,[$rp,#-8*8]
|
||||
ldur $n0,[$rp,#-8*8]
|
||||
adds $acc0,$acc0,$a0
|
||||
adcs $acc1,$acc1,$a1
|
||||
ldp $a0,$a1,[$np,#8*0]
|
||||
|
@ -131,12 +131,6 @@ $code.=<<___;
|
||||
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
|
||||
.Lone:
|
||||
.long 1,0,0,0
|
||||
.LOPENSSL_armcap_P:
|
||||
#ifdef __ILP32__
|
||||
.long OPENSSL_armcap_P-.
|
||||
#else
|
||||
.quad OPENSSL_armcap_P-.
|
||||
#endif
|
||||
.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
|
||||
.globl ChaCha20_ctr32
|
||||
@ -144,17 +138,13 @@ $code.=<<___;
|
||||
.align 5
|
||||
ChaCha20_ctr32:
|
||||
cbz $len,.Labort
|
||||
adr @x[0],.LOPENSSL_armcap_P
|
||||
cmp $len,#192
|
||||
b.lo .Lshort
|
||||
#ifdef __ILP32__
|
||||
ldrsw @x[1],[@x[0]]
|
||||
#else
|
||||
ldr @x[1],[@x[0]]
|
||||
#endif
|
||||
ldr w17,[@x[1],@x[0]]
|
||||
|
||||
adrp x17,OPENSSL_armcap_P
|
||||
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||
tst w17,#ARMV7_NEON
|
||||
b.ne ChaCha20_neon
|
||||
b.ne .LChaCha20_neon
|
||||
|
||||
.Lshort:
|
||||
.inst 0xd503233f // paciasp
|
||||
@ -380,6 +370,7 @@ $code.=<<___;
|
||||
.type ChaCha20_neon,%function
|
||||
.align 5
|
||||
ChaCha20_neon:
|
||||
.LChaCha20_neon:
|
||||
.inst 0xd503233f // paciasp
|
||||
stp x29,x30,[sp,#-96]!
|
||||
add x29,sp,#0
|
||||
|
@ -1654,7 +1654,7 @@ ecp_nistz256_scatter_w5:
|
||||
|
||||
ldp x4,x5,[$inp] // X
|
||||
ldp x6,x7,[$inp,#16]
|
||||
str w4,[$out,#64*0-4]
|
||||
stur w4,[$out,#64*0-4]
|
||||
lsr x4,x4,#32
|
||||
str w5,[$out,#64*1-4]
|
||||
lsr x5,x5,#32
|
||||
@ -1670,7 +1670,7 @@ ecp_nistz256_scatter_w5:
|
||||
|
||||
ldp x4,x5,[$inp,#32] // Y
|
||||
ldp x6,x7,[$inp,#48]
|
||||
str w4,[$out,#64*0-4]
|
||||
stur w4,[$out,#64*0-4]
|
||||
lsr x4,x4,#32
|
||||
str w5,[$out,#64*1-4]
|
||||
lsr x5,x5,#32
|
||||
@ -1686,7 +1686,7 @@ ecp_nistz256_scatter_w5:
|
||||
|
||||
ldp x4,x5,[$inp,#64] // Z
|
||||
ldp x6,x7,[$inp,#80]
|
||||
str w4,[$out,#64*0-4]
|
||||
stur w4,[$out,#64*0-4]
|
||||
lsr x4,x4,#32
|
||||
str w5,[$out,#64*1-4]
|
||||
lsr x5,x5,#32
|
||||
|
@ -103,6 +103,12 @@ my $asciz = sub {
|
||||
{ ""; }
|
||||
};
|
||||
|
||||
my $adrp = sub {
|
||||
my ($args,$comment) = split(m|\s*//|,shift);
|
||||
"\tadrp\t$args\@PAGE";
|
||||
} if ($flavour =~ /ios64/);
|
||||
|
||||
|
||||
sub range {
|
||||
my ($r,$sfx,$start,$end) = @_;
|
||||
|
||||
@ -132,6 +138,10 @@ sub expand_line {
|
||||
|
||||
$line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge;
|
||||
|
||||
if ($flavour =~ /ios64/) {
|
||||
$line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/;
|
||||
}
|
||||
|
||||
return $line;
|
||||
}
|
||||
|
||||
|
@ -71,17 +71,12 @@ poly1305_init:
|
||||
csel x0,xzr,x0,eq
|
||||
b.eq .Lno_key
|
||||
|
||||
#ifdef __ILP32__
|
||||
ldrsw $t1,.LOPENSSL_armcap_P
|
||||
#else
|
||||
ldr $t1,.LOPENSSL_armcap_P
|
||||
#endif
|
||||
adr $t0,.LOPENSSL_armcap_P
|
||||
adrp x17,OPENSSL_armcap_P
|
||||
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||
|
||||
ldp $r0,$r1,[$inp] // load key
|
||||
mov $s1,#0xfffffffc0fffffff
|
||||
movk $s1,#0x0fff,lsl#48
|
||||
ldr w17,[$t0,$t1]
|
||||
#ifdef __ARMEB__
|
||||
rev $r0,$r0 // flip bytes
|
||||
rev $r1,$r1
|
||||
@ -93,10 +88,10 @@ poly1305_init:
|
||||
|
||||
tst w17,#ARMV7_NEON
|
||||
|
||||
adr $d0,poly1305_blocks
|
||||
adr $r0,poly1305_blocks_neon
|
||||
adr $d1,poly1305_emit
|
||||
adr $r1,poly1305_emit_neon
|
||||
adr $d0,.Lpoly1305_blocks
|
||||
adr $r0,.Lpoly1305_blocks_neon
|
||||
adr $d1,.Lpoly1305_emit
|
||||
adr $r1,.Lpoly1305_emit_neon
|
||||
|
||||
csel $d0,$d0,$r0,eq
|
||||
csel $d1,$d1,$r1,eq
|
||||
@ -115,6 +110,7 @@ poly1305_init:
|
||||
.type poly1305_blocks,%function
|
||||
.align 5
|
||||
poly1305_blocks:
|
||||
.Lpoly1305_blocks:
|
||||
ands $len,$len,#-16
|
||||
b.eq .Lno_data
|
||||
|
||||
@ -179,6 +175,7 @@ poly1305_blocks:
|
||||
.type poly1305_emit,%function
|
||||
.align 5
|
||||
poly1305_emit:
|
||||
.Lpoly1305_emit:
|
||||
ldp $h0,$h1,[$ctx] // load hash base 2^64
|
||||
ldr $h2,[$ctx,#16]
|
||||
ldp $t0,$t1,[$nonce] // load nonce
|
||||
@ -285,10 +282,11 @@ poly1305_splat:
|
||||
.type poly1305_blocks_neon,%function
|
||||
.align 5
|
||||
poly1305_blocks_neon:
|
||||
.Lpoly1305_blocks_neon:
|
||||
ldr $is_base2_26,[$ctx,#24]
|
||||
cmp $len,#128
|
||||
b.hs .Lblocks_neon
|
||||
cbz $is_base2_26,poly1305_blocks
|
||||
cbz $is_base2_26,.Lpoly1305_blocks
|
||||
|
||||
.Lblocks_neon:
|
||||
.inst 0xd503233f // paciasp
|
||||
@ -431,7 +429,7 @@ poly1305_blocks_neon:
|
||||
csel $in2,$zeros,$in2,lo
|
||||
|
||||
mov x4,#1
|
||||
str x4,[$ctx,#-24] // set is_base2_26
|
||||
stur x4,[$ctx,#-24] // set is_base2_26
|
||||
sub $ctx,$ctx,#48 // restore original $ctx
|
||||
b .Ldo_neon
|
||||
|
||||
@ -868,6 +866,7 @@ poly1305_blocks_neon:
|
||||
.type poly1305_emit_neon,%function
|
||||
.align 5
|
||||
poly1305_emit_neon:
|
||||
.Lpoly1305_emit_neon:
|
||||
ldr $is_base2_26,[$ctx,#24]
|
||||
cbz $is_base2_26,poly1305_emit
|
||||
|
||||
@ -920,12 +919,6 @@ poly1305_emit_neon:
|
||||
.align 5
|
||||
.Lzeros:
|
||||
.long 0,0,0,0,0,0,0,0
|
||||
.LOPENSSL_armcap_P:
|
||||
#ifdef __ILP32__
|
||||
.long OPENSSL_armcap_P-.
|
||||
#else
|
||||
.quad OPENSSL_armcap_P-.
|
||||
#endif
|
||||
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
|
@ -58,10 +58,10 @@ $code.=<<___ if ($i<15 && !($i&1));
|
||||
lsr @Xx[$i+1],@Xx[$i],#32
|
||||
___
|
||||
$code.=<<___ if ($i<14 && !($i&1));
|
||||
ldr @Xx[$i+2],[$inp,#`($i+2)*4-64`]
|
||||
ldur @Xx[$i+2],[$inp,#`($i+2)*4-64`]
|
||||
___
|
||||
$code.=<<___ if ($i<14 && ($i&1));
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror @Xx[$i+1],@Xx[$i+1],#32
|
||||
#else
|
||||
rev32 @Xx[$i+1],@Xx[$i+1]
|
||||
@ -171,23 +171,19 @@ ___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#include "arm_arch.h"
|
||||
#ifndef __KERNEL__
|
||||
# include "arm_arch.h"
|
||||
.extern OPENSSL_armcap_P
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
.extern OPENSSL_armcap_P
|
||||
.globl sha1_block_data_order
|
||||
.type sha1_block_data_order,%function
|
||||
.align 6
|
||||
sha1_block_data_order:
|
||||
#ifdef __ILP32__
|
||||
ldrsw x16,.LOPENSSL_armcap_P
|
||||
#else
|
||||
ldr x16,.LOPENSSL_armcap_P
|
||||
#endif
|
||||
adr x17,.LOPENSSL_armcap_P
|
||||
add x16,x16,x17
|
||||
ldr w16,[x16]
|
||||
adrp x16,OPENSSL_armcap_P
|
||||
ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
|
||||
tst w16,#ARMV8_SHA1
|
||||
b.ne .Lv8_entry
|
||||
|
||||
@ -208,7 +204,7 @@ sha1_block_data_order:
|
||||
movz $K,#0x7999
|
||||
sub $num,$num,#1
|
||||
movk $K,#0x5a82,lsl#16
|
||||
#ifdef __ARMEB__
|
||||
#ifdef __AARCH64EB__
|
||||
ror $Xx[0],@Xx[0],#32
|
||||
#else
|
||||
rev32 @Xx[0],@Xx[0]
|
||||
@ -321,15 +317,11 @@ $code.=<<___;
|
||||
.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
|
||||
.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
|
||||
.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
|
||||
.LOPENSSL_armcap_P:
|
||||
#ifdef __ILP32__
|
||||
.long OPENSSL_armcap_P-.
|
||||
#else
|
||||
.quad OPENSSL_armcap_P-.
|
||||
#endif
|
||||
.asciz "SHA1 block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
#if !defined(__KERNELL__) && !defined(_WIN64)
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
#endif
|
||||
___
|
||||
}}}
|
||||
|
||||
|
@ -188,24 +188,18 @@ ___
|
||||
$code.=<<___;
|
||||
#ifndef __KERNEL__
|
||||
# include "arm_arch.h"
|
||||
.extern OPENSSL_armcap_P
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
.extern OPENSSL_armcap_P
|
||||
.globl $func
|
||||
.type $func,%function
|
||||
.align 6
|
||||
$func:
|
||||
#ifndef __KERNEL__
|
||||
# ifdef __ILP32__
|
||||
ldrsw x16,.LOPENSSL_armcap_P
|
||||
# else
|
||||
ldr x16,.LOPENSSL_armcap_P
|
||||
# endif
|
||||
adr x17,.LOPENSSL_armcap_P
|
||||
add x16,x16,x17
|
||||
ldr w16,[x16]
|
||||
adrp x16,OPENSSL_armcap_P
|
||||
ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
|
||||
___
|
||||
$code.=<<___ if ($SZ==4);
|
||||
tst w16,#ARMV8_SHA256
|
||||
@ -353,15 +347,6 @@ $code.=<<___ if ($SZ==4);
|
||||
___
|
||||
$code.=<<___;
|
||||
.size .LK$BITS,.-.LK$BITS
|
||||
#ifndef __KERNEL__
|
||||
.align 3
|
||||
.LOPENSSL_armcap_P:
|
||||
# ifdef __ILP32__
|
||||
.long OPENSSL_armcap_P-.
|
||||
# else
|
||||
.quad OPENSSL_armcap_P-.
|
||||
# endif
|
||||
#endif
|
||||
.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 2
|
||||
___
|
||||
@ -841,7 +826,7 @@ ___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
#ifndef __KERNEL__
|
||||
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
#endif
|
||||
___
|
||||
|
Loading…
Reference in New Issue
Block a user