openssl/crypto/modes/asm/ghash-riscv64-zvkb-zvbc.pl
Jerry Shih 3645eb0be2 Update for Zvkb extension.
c8ddeb7e64/doc/vector/riscv-crypto-vector-zvkb.adoc
Create `RISCV_HAS_ZVKB()` macro.
Use zvkb for SM4 instead of zvbb.
Use zvkb for ghash instead of zvbb.
We could just use zvbb's subset `zvkb` for flexibility.

Signed-off-by: Jerry Shih <jerry.shih@sifive.com>
Signed-off-by: Phoebe Chen <phoebe.chen@sifive.com>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Hugo Landau <hlandau@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/21923)
2023-10-26 15:55:50 +01:00


#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector Carryless Multiplication extension ('Zvbc')
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
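# Typical invocation (illustrative only; the actual flavour string and output
# path are supplied by the OpenSSL build system):
#   perl ghash-riscv64-zvkb-zvbc.pl linux64 ghash-riscv64-zvkb-zvbc.S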
my $code=<<___;
.text
___
################################################################################
# void gcm_init_rv64i_zvkb_zvbc(u128 Htable[16], const u64 H[2]);
#
# input: H: 128-bit H - secret parameter E(K, 0^128)
# output: Htable: Preprocessed key data for gcm_gmult_rv64i_zvkb_zvbc and
# gcm_ghash_rv64i_zvkb_zvbc
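#
# Sketch of the precomputation below: Htable[0] is set to H shifted left by
# one bit; if the shift carries out of bit 127, the result is additionally
# XORed with the 128-bit reduction constant stored at Lpolymod. This mirrors
# the approach of the x86 CLMUL-based GHASH code referenced in the comments
# below.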
{
my ($Htable,$H,$TMP0,$TMP1,$TMP2) = ("a0","a1","t0","t1","t2");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
$code .= <<___;
.p2align 3
.globl gcm_init_rv64i_zvkb_zvbc
.type gcm_init_rv64i_zvkb_zvbc,\@function
gcm_init_rv64i_zvkb_zvbc:
# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $H, $H, 8
li $TMP0, -8
li $TMP1, 63
la $TMP2, Lpolymod
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V1, $H, $TMP0]} # vlse64.v v1, (a1), t0
@{[vle64_v $V2, $TMP2]} # vle64.v v2, (t2)
# Shift one left and get the carry bits.
@{[vsrl_vx $V3, $V1, $TMP1]} # vsrl.vx v3, v1, t1
@{[vsll_vi $V1, $V1, 1]} # vsll.vi v1, v1, 1
# Use the fact that the polynomial degree is no more than 128,
# i.e. only the LSB of the upper half could be set.
# Thanks to this we don't need to do the full reduction here.
# Instead simply subtract the reduction polynomial.
# This idea was taken from x86 ghash implementation in OpenSSL.
@{[vslideup_vi $V4, $V3, 1]} # vslideup.vi v4, v3, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
@{[vor_vv_v0t $V1, $V1, $V4]} # vor.vv v1, v1, v4, v0.t
# Need to set the mask to 3 if the carry bit is set.
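# (A mask of 3 selects both 64-bit elements, so the full 128-bit reduction
# constant from Lpolymod gets XORed in; a mask of 0 leaves v1 untouched.)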
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vmv_v_i $V3, 0]} # vmv.v.i v3, 0
@{[vmerge_vim $V3, $V3, 3]} # vmerge.vim v3, v3, 3, v0
@{[vmv_v_v $V0, $V3]} # vmv.v.v v0, v3
@{[vxor_vv_v0t $V1, $V1, $V2]} # vxor.vv v1, v1, v2, v0.t
@{[vse64_v $V1, $Htable]} # vse64.v v1, (a0)
ret
.size gcm_init_rv64i_zvkb_zvbc,.-gcm_init_rv64i_zvkb_zvbc
___
}
################################################################################
# void gcm_gmult_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16]);
#
# input: Xi: current hash value
# Htable: preprocessed H
# output: Xi: next hash value Xi = (Xi * H mod f)
{
my ($Xi,$Htable,$TMP0,$TMP1,$TMP2,$TMP3,$TMP4) = ("a0","a1","t0","t1","t2","t3","t4");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6) = ("v0","v1","v2","v3","v4","v5","v6");
$code .= <<___;
.text
.p2align 3
.globl gcm_gmult_rv64i_zvkb_zvbc
.type gcm_gmult_rv64i_zvkb_zvbc,\@function
gcm_gmult_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $Xi, $Xi, 8
li $TMP4, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $TMP4]} # vlse64.v v5, (a0), t4
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
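# (Here (x)l and (x)h denote the low and high 64-bit halves of the 128-bit
# carry-less product x, as produced by vclmul and vclmulh respectively.)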
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256-bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of Gueron's Montgomery reduction.
# The difference is that the order of some operations has been changed
# to make better use of the vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
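# (P is the 64-bit reduction constant 0xc200000000000000, loaded into t3
# from Lpolymod+8 in the function prologue.)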
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
# v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
# v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V2, $V2]} # vrev8.v v2, v2
@{[vsse64_v $V2, $Xi, $TMP4]} # vsse64.v v2, (a0), t4
ret
.size gcm_gmult_rv64i_zvkb_zvbc,.-gcm_gmult_rv64i_zvkb_zvbc
___
}
################################################################################
# void gcm_ghash_rv64i_zvkb_zvbc(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: preprocessed H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: Xi+1 (next hash value Xi)
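#
# The loop below folds one 16-byte block per iteration:
#   Xi = (Xi xor inp[i]) * H mod f
# i.e. the usual GHASH/Horner accumulation, using the same multiply/reduce
# sequence as gcm_gmult_rv64i_zvkb_zvbc above.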
{
my ($Xi,$Htable,$inp,$len,$TMP0,$TMP1,$TMP2,$TMP3,$M8,$TMP5,$TMP6) = ("a0","a1","a2","a3","t0","t1","t2","t3","t4","t5","t6");
my ($V0,$V1,$V2,$V3,$V4,$V5,$V6,$Vinp) = ("v0","v1","v2","v3","v4","v5","v6","v7");
$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zvkb_zvbc
.type gcm_ghash_rv64i_zvkb_zvbc,\@function
gcm_ghash_rv64i_zvkb_zvbc:
ld $TMP0, ($Htable)
ld $TMP1, 8($Htable)
li $TMP2, 63
la $TMP3, Lpolymod
ld $TMP3, 8($TMP3)
# Load/store data in reverse order.
# This is needed as part of the endianness swap.
add $Xi, $Xi, 8
add $inp, $inp, 8
li $M8, -8
@{[vsetivli__x0_2_e64_m1_tu_mu]} # vsetivli x0, 2, e64, m1, tu, mu
@{[vlse64_v $V5, $Xi, $M8]} # vlse64.v v5, (a0), t4
Lstep:
# Read input data
@{[vlse64_v $Vinp, $inp, $M8]} # vlse64.v v7, (a2), t4
add $inp, $inp, 16
add $len, $len, -16
# XOR them into Xi
@{[vxor_vv $V5, $V5, $Vinp]} # vxor.vv v5, v5, v7
@{[vrev8_v $V5, $V5]} # vrev8.v v5, v5
# Multiplication
# Do two 64x64 multiplications in one go to save some time
# and simplify things.
# A = a1a0 (t1, t0)
# B = b1b0 (v5)
# C = c1c0 (256 bit)
# c1 = a1b1 + (a0b1)h + (a1b0)h
# c0 = a0b0 + (a0b1)l + (a1b0)l
# v1 = (a0b1)l,(a0b0)l
@{[vclmul_vx $V1, $V5, $TMP0]} # vclmul.vx v1, v5, t0
# v3 = (a0b1)h,(a0b0)h
@{[vclmulh_vx $V3, $V5, $TMP0]} # vclmulh.vx v3, v5, t0
# v4 = (a1b1)l,(a1b0)l
@{[vclmul_vx $V4, $V5, $TMP1]} # vclmul.vx v4, v5, t1
# v2 = (a1b1)h,(a1b0)h
@{[vclmulh_vx $V2, $V5, $TMP1]} # vclmulh.vx v2, v5, t1
# Is there a better way to do this?
# Would need to swap the order of elements within a vector register.
@{[vslideup_vi $V5, $V3, 1]} # vslideup.vi v5, v3, 1
@{[vslideup_vi $V6, $V4, 1]} # vslideup.vi v6, v4, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vslidedown_vi $V4, $V4, 1]} # vslidedown.vi v4, v4, 1
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
# v2 += (a0b1)h
@{[vxor_vv_v0t $V2, $V2, $V3]} # vxor.vv v2, v2, v3, v0.t
# v2 += (a1b1)l
@{[vxor_vv_v0t $V2, $V2, $V4]} # vxor.vv v2, v2, v4, v0.t
@{[vmv_v_i $V0, 2]} # vmv.v.i v0, 2
# v1 += (a0b0)h,0
@{[vxor_vv_v0t $V1, $V1, $V5]} # vxor.vv v1, v1, v5, v0.t
# v1 += (a1b0)l,0
@{[vxor_vv_v0t $V1, $V1, $V6]} # vxor.vv v1, v1, v6, v0.t
# Now the 256-bit product should be stored in (v2,v1)
# v1 = (a0b1)l + (a0b0)h + (a1b0)l, (a0b0)l
# v2 = (a1b1)h, (a1b0)h + (a0b1)h + (a1b1)l
# Reduction
# Let C := A*B = c3,c2,c1,c0 = v2[1],v2[0],v1[1],v1[0]
# This is a slight variation of Gueron's Montgomery reduction.
# The difference is that the order of some operations has been changed
# to make better use of the vclmul(h) instructions.
# First step:
# c1 += (c0 * P)l
# vmv.v.i v0, 2
@{[vslideup_vi_v0t $V3, $V1, 1]} # vslideup.vi v3, v1, 1, v0.t
@{[vclmul_vx_v0t $V3, $V3, $TMP3]} # vclmul.vx v3, v3, t3, v0.t
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# Second step:
# D = d1,d0 is final result
# We want:
# m1 = c1 + (c1 * P)h
# m0 = (c1 * P)l + (c0 * P)h + c0
# d1 = c3 + m1
# d0 = c2 + m0
# v3 = (c1 * P)l, 0
@{[vclmul_vx_v0t $V3, $V1, $TMP3]} # vclmul.vx v3, v1, t3, v0.t
# v4 = (c1 * P)h, (c0 * P)h
@{[vclmulh_vx $V4, $V1, $TMP3]} # vclmulh.vx v4, v1, t3
@{[vmv_v_i $V0, 1]} # vmv.v.i v0, 1
@{[vslidedown_vi $V3, $V3, 1]} # vslidedown.vi v3, v3, 1
@{[vxor_vv $V1, $V1, $V4]} # vxor.vv v1, v1, v4
@{[vxor_vv_v0t $V1, $V1, $V3]} # vxor.vv v1, v1, v3, v0.t
# XOR in the upper part of the product
@{[vxor_vv $V2, $V2, $V1]} # vxor.vv v2, v2, v1
@{[vrev8_v $V5, $V2]} # vrev8.v v5, v2
bnez $len, Lstep
@{[vsse64_v $V5, $Xi, $M8]} # vsse64.v v5, (a0), t4
ret
.size gcm_ghash_rv64i_zvkb_zvbc,.-gcm_ghash_rv64i_zvkb_zvbc
___
}
$code .= <<___;
.p2align 4
Lpolymod:
.dword 0x0000000000000001
.dword 0xc200000000000000
.size Lpolymod,.-Lpolymod
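# Note: read little-endian, the two dwords above form the 128-bit value
# 0xc2000000000000000000000000000001, the reduction constant used by
# carry-less-multiply GHASH implementations (cf. the 0x1c2 polynomial
# constant in the x86 code).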
___
print $code;
close STDOUT or die "error closing STDOUT: $!";