#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, "Efficient Software Implementations of Modular
#     Exponentiation", http://eprint.iacr.org/2011/239
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
#     IEEE Proceedings of 9th International Conference on Information
#     Technology: New Generations (ITNG 2012), 821-823 (2012).
# [3] S. Gueron, "Efficient Software Implementations of Modular Exponentiation",
#     Journal of Cryptographic Engineering 2:31-43 (2012).
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing
#     RSA1024 and RSA2048 on x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
#
# While the original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort, the highest measured
# being ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# at the moment of this writing!] Nor does this module implement a
# "monolithic" complete-exponentiation jumbo subroutine, but adheres
# to a more modular mixture of C and assembly. And it's optimized even
# for processors other than the Intel Core family (see table below for
# improvement coefficients).
#						<appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#			----------------+---------------------------
# Opteron		+13%		|+5%		+20%
# Bulldozer		-0%		|-1%		+10%
# P4			+11%		|+7%		+8%
# Westmere		+5%		|+14%		+17%
# Sandy Bridge		+2%		|+12%		+29%
# Ivy Bridge		+1%		|+11%		+35%
# Haswell(**)		-0%		|+12%		+39%
# Atom			+13%		|+11%		+4%
# VIA Nano		+70%		|+9%		+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
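
# Everything this script generates goes to STDOUT, which is a pipe into the
# xlate driver opened above.  If the driver were to crash mid-stream, the
# output could be silently truncated; that is why STDOUT is close-checked
# at the very end of this file.
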
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
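
# $addx gates the alternative code paths built around the MULX/ADCX/ADOX
# instructions (BMI2 + ADX).  The probes above merely establish that the
# assembler at hand is recent enough to encode them.
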
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P
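
# rsaz_512_sqr(out, inp, mod, n0, times) squares a 512-bit number the
# requested number of times, reducing after every pass.  The squaring
# itself computes every off-diagonal 64x64-bit product just once, doubles
# the accumulated columns with add/adc chains, then folds in the diagonal
# squares.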
.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lsqr_body:
	movq	$mod, %xmm1		# common off-load
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
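# The 'times' argument, kept at 128+8(%rsp), chains repeated squarings:
# after each reduce/subtract pass the result becomes the next input.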
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx		# 0($inp)
	mov	%rax, %rbp		# 8($inp)
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	adcq	\$0, %rdx

	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
	addq	%r8, %r8
	movq	%rdx, %r15
	adcq	\$0, %rcx

	mulq	%rax
	addq	%r8, %rdx
	adcq	\$0, %rcx

	movq	%rax, (%rsp)
	movq	%rdx, 8(%rsp)

#second iteration
	movq	16($inp), %rax
	mulq	%rbp
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r15
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r8
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	movq	16($inp), %rbp
	addq	%rax, %r9
	movq	24($inp), %rax
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)

#third iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r8
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r9
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	movq	24($inp), %r10
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)

#fourth iteration
	mov	%rax, %r11		# 32($inp)
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %r12		# 40($inp)
	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %rbp		# 48($inp)
	mulq	%r10
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r14:r13 = r14:r13 << 1
	addq	%r13, %r13
	movq	%rdx, %r10
	adcq	%r14, %r14
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r13
	movq	%r12, %rax		# 40($inp)
	adcq	%rdx, %r14
	adcq	\$0, %rbx

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)

#fifth iteration
	mulq	%r11
	addq	%rax, %r8
	movq	%rbp, %rax		# 48($inp)
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mov	%rax, %r14		# 56($inp)
	mulq	%r11
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
	addq	%r15, %r15
	movq	%rdx, %r11
	adcq	%r8, %r8
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r15
	movq	%rbp, %rax		# 48($inp)
	adcq	%rdx, %r8
	adcq	\$0, %rcx

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)

#sixth iteration
	mulq	%r12
	addq	%rax, %r10
	movq	%r14, %rax		# 56($inp)
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r12
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r9
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	%rbp, %rax
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r13
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r11
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
	addq	%r13, %r13
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%r13, %rax
	adcq	%rbx, %rdx

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15
	movq	%xmm1, %rbp

	movq	%rax, 112(%rsp)
	movq	%rdx, 120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
#first iteration
	mulx	%rax, %r8, %r9
	mov	%rax, %rbx

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx 32($inp), %rcx, %r12
	adcx	%rax, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx 40($inp), %rax, %r13
	adcx	%rcx, %r11

	mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mulx	%rdx, %rax, $out
	mov	%rbx, %rdx		# 8($inp)
	xor	%rcx, %rcx
	adox	%r8, %r8
	adcx	$out, %r8
	adox	%rbp, %rcx
	adcx	%rbp, %rcx

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx 16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	mulx	24($inp), $out, %r8
	adox	$out, %r11
	.byte	0x66
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx 48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx 56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	mulx	%rdx, %rax, $out
	adox	%rbp, %r8
	.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov 16($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adox	%rbp, %rbx
	adcx	$out, %r10
	adcx	%rbp, %rbx

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov %r10, 24(%rsp)

#third iteration
	mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx 40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx 48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	mulx	%rdx, %rax, $out
	adox	%rbp, %r9
	mov	24($inp), %rdx

	xor	%rcx, %rcx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	mov	%r11, 32(%rsp)
	mov	%r12, 40(%rsp)

#fourth iteration
	mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	mulx	%rdx, %rax, $out
	adox	%rbp, %r10
	mov	32($inp), %rdx

	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r14, %r14
	adcx	%rax, %r13
	adox	%rbp, %rbx
	adcx	$out, %r14
	adcx	%rbp, %rbx

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	mulx	%rdx, %rax, $out
	mov	40($inp), %rdx
	adox	%rbp, %r11

	xor	%rcx, %rcx
	adox	%r15, %r15
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r8, %r8
	adcx	%rax, %r15
	adox	%rbp, %rcx
	adcx	$out, %r8
	adcx	%rbp, %rcx

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx 48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx 56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	mulx	%rdx, %rax, $out
	adox	%rbp, %r12
	mov	48($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adcx	$out, %r10
	adox	%rbp, %rbx
	adcx	%rbp, %rbx

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx 56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	mulx	%rdx, %rax, $out
	xor	%rcx, %rcx
	mov	56($inp), %rdx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov %r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov %r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%rbp, %rbx
	adcx	%r13, %rax
	adcx	%rdx, %rbx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	movq	%rax, 112(%rsp)
	movq	%rbx, 120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lsqr_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
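# rsaz_512_mul(out, ap, bp, mod, n0) amounts to a 512-bit Montgomery
# multiplication: multiply, reduce using n0 = -mod^-1 mod 2^64, add the
# upper half of the product, then subtract mod once if the addition
# carried out.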
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
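# rsaz_512_mul_gather4(out, ap, bp, mod, n0, pwr): like rsaz_512_mul, but
# the second operand is entry 'pwr' of a 16-entry scattered table at bp,
# fetched with the constant-time mask-and-gather sequence below.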
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$`128+24+($win64?0xb0:0)`, %rsp
.cfi_adjust_cfa_offset	`128+24+($win64?0xb0:0)`
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0xa0(%rsp)
	movaps	%xmm7,0xb0(%rsp)
	movaps	%xmm8,0xc0(%rsp)
	movaps	%xmm9,0xd0(%rsp)
	movaps	%xmm10,0xe0(%rsp)
	movaps	%xmm11,0xf0(%rsp)
	movaps	%xmm12,0x100(%rsp)
	movaps	%xmm13,0x110(%rsp)
	movaps	%xmm14,0x120(%rsp)
	movaps	%xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
	movd	$pwr,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
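# The masks select one of the 16 table entries.  Each gather step below
# reads a full 128-byte table row and ANDs it with the masks, so the
# sequence of memory addresses touched is independent of $power -- a
# cache-timing-safe (constant-time) table lookup.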
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movdqa	16*0($bp),%xmm8
	movdqa	16*1($bp),%xmm9
	movdqa	16*2($bp),%xmm10
	movdqa	16*3($bp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($bp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($bp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($bp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($bp),%xmm15
	leaq	128($bp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_gather
___
$code.=<<___;
	movq	%xmm8,%rbx

	movq	$n0, 128(%rsp)		# off-load arguments
	movq	$out, 128+8(%rsp)
	movq	$mod, 128+16(%rsp)

	movq	($ap), %rax
	movq	8($ap), %rcx
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rbx

	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	128+8(%rsp), $out
	movq	128+16(%rsp), %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	movq	%xmm8,%rdx

	mov	$n0, 128(%rsp)		# off-load arguments
	mov	$out, 128+8(%rsp)
	mov	$mod, 128+16(%rsp)

	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9

	mulx	16($ap), %rbx, %r10
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	adcx	%rbx, %r13
	adcx	%rax, %r14
	.byte	0x67
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rdx

	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx ($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx 24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx 48($ap), %rax, %r14
	adcx	%rax, %r13
	.byte	0x67
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	mov	128(%rsp), %rdx		# pull arguments
	mov	128+8(%rsp), $out
	mov	128+16(%rsp), %rbp

	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_gather4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
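# rsaz_512_mul_scatter4(out, ap, mod, n0, tbl, pwr) multiplies, reduces
# and then scatters the result into slot 'pwr' of the table at tbl, eight
# qwords 128 bytes apart, ready for later constant-time gathering.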
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_scatter4_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
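# rsaz_512_mul_by_one(out, inp, mod, n0) runs the reduction against a
# zeroed upper half, i.e. computes inp/2^512 mod mod -- the conversion
# out of Montgomery representation.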
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp		# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
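	# Each of the eight rounds folds the lowest live word into the
	# modulus: %rbx = word * n0 mod 2^64, so that adding %rbx*mod clears
	# that word and the eight-register window slides up by 64 bits --
	# word-by-word Montgomery reduction.  (128+8(%rsp) below is the
	# caller's 128(%rsp), adjusted for the return address pushed by
	# call.)  The caller then adds the upper half of the product from
	# 64..120 of its own frame.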
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
.cfi_startproc
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.cfi_endproc
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
.cfi_startproc
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx 32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx 48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.cfi_endproc
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input:	%r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers:	everything but %rdi, %rsi and %rbp
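	# %rcx is 0 or all-ones (sbbq %rcx,%rcx in the caller).  The neg/not
	# sequence forms the two's complement of the modulus, the mask
	# selects either -mod or 0, and the final addition therefore
	# subtracts mod exactly when the caller's addition carried out --
	# a branch-free conditional subtraction.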
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
.cfi_startproc
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.cfi_endproc
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
my ($ap,$bp) = ("%rsi","%rbp");
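# Note that %rdi is set to 8(%rsp): the call instruction pushed a return
# address, so this is the caller's (%rsp) and the 1024-bit product lands
# at (%rsp)..120(%rsp) of the caller's frame.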
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
.cfi_startproc
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.cfi_endproc
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
	# __rsaz_512_mulx
	#
	# input:	%rsi - ap, %rbp - bp
	# output:
	# clobbers:	everything
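	# MULX leaves the flags untouched, so two carry chains can run in
	# parallel: ADCX consumes and produces only CF, while ADOX uses only
	# OF.  The loop below interleaves both chains, folding the previous
	# iteration's carries while the next partial product is being formed.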
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
.cfi_startproc
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx 32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx 8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx 16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx 48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx 56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.cfi_endproc
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
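# rsaz_512_scatter4 stores the eight qwords of a 512-bit value 128 bytes
# apart, building the table layout that rsaz_512_gather4 (and
# rsaz_512_mul_gather4 above) read back with a power-independent access
# pattern.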
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
.cfi_startproc
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.cfi_endproc
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
.cfi_startproc
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size	rsaz_512_gather4,.-rsaz_512_gather4
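
# .Linc seeds the index vectors for the gather mask: xmm0 starts out as
# dwords {0,0,1,1} -- indices 0 and 1, each duplicated across a 64-bit
# lane -- and .Linc+16 is the per-step increment of 2 for the paddd chain
# above.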
.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
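# se_handler consults HandlerData[] (the body/epilogue label pairs emitted
# in .xdata below) to decide whether the faulting RIP lies inside a
# function body; if so it recovers the non-volatile GPRs (and, for
# mul_gather4, the saved XMM registers) from the stack frame before
# letting the unwind continue.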
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT";