mirror of
https://github.com/openssl/openssl.git
synced 2025-01-30 14:01:55 +08:00
Run util/openssl-format-source on the Curve448 code
Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de> (Merged from https://github.com/openssl/openssl/pull/5105)
This commit is contained in:
parent
1308e022e1
commit
205fd63881
@ -11,20 +11,21 @@
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
|
||||
#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
|
||||
# define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
|
||||
|
||||
#define ARCH_WORD_BITS 32
|
||||
# define ARCH_WORD_BITS 32
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
uint32_t word_is_zero(uint32_t a) {
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
uint32_t word_is_zero(uint32_t a)
|
||||
{
|
||||
/* Returns all-ones (0xffffffff) if a == 0, else 0. Let's hope the compiler isn't clever enough to optimize this. */
|
||||
return (((uint64_t)a)-1)>>32;
|
||||
return (((uint64_t)a) - 1) >> 32;
|
||||
}
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
uint64_t widemul(uint32_t a, uint32_t b) {
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
uint64_t widemul(uint32_t a, uint32_t b)
|
||||
{
|
||||
return ((uint64_t)a) * b;
|
||||
}
|
||||
|
||||
#endif /* __ARCH_ARCH_32_ARCH_INTRINSICS_H__ */
|
||||
|
||||
#endif /* __ARCH_ARCH_32_ARCH_INTRINSICS_H__ */
|
||||
|
@ -14,84 +14,80 @@
|
||||
|
||||
#if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) && !I_HATE_UNROLLED_LOOPS) \
|
||||
|| defined(DECAF_FORCE_UNROLL)
|
||||
#define REPEAT8(_x) _x _x _x _x _x _x _x _x
|
||||
#define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0)
|
||||
# define REPEAT8(_x) _x _x _x _x _x _x _x _x
|
||||
# define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0)
|
||||
#else
|
||||
#define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
|
||||
# define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
|
||||
#endif
|
||||
|
||||
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
|
||||
{
|
||||
const uint32_t *a = as->limb, *b = bs->limb;
|
||||
uint32_t *c = cs->limb;
|
||||
|
||||
uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
|
||||
uint32_t mask = (1<<28) - 1;
|
||||
uint32_t mask = (1 << 28) - 1;
|
||||
|
||||
uint32_t aa[8], bb[8];
|
||||
|
||||
int i,j;
|
||||
for (i=0; i<8; i++) {
|
||||
aa[i] = a[i] + a[i+8];
|
||||
bb[i] = b[i] + b[i+8];
|
||||
|
||||
int i, j;
|
||||
for (i = 0; i < 8; i++) {
|
||||
aa[i] = a[i] + a[i + 8];
|
||||
bb[i] = b[i] + b[i + 8];
|
||||
}
|
||||
|
||||
FOR_LIMB(j,0,8,{
|
||||
accum2 = 0;
|
||||
|
||||
FOR_LIMB (i,0,j+1,{
|
||||
accum2 += widemul(a[j-i],b[i]);
|
||||
accum1 += widemul(aa[j-i],bb[i]);
|
||||
accum0 += widemul(a[8+j-i], b[8+i]);
|
||||
});
|
||||
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
accum2 = 0;
|
||||
|
||||
FOR_LIMB (i,j+1,8,{
|
||||
accum0 -= widemul(a[8+j-i], b[i]);
|
||||
accum2 += widemul(aa[8+j-i], bb[i]);
|
||||
accum1 += widemul(a[16+j-i], b[8+i]);
|
||||
});
|
||||
|
||||
accum1 += accum2;
|
||||
accum0 += accum2;
|
||||
FOR_LIMB(j, 0, 8, {
|
||||
accum2 = 0;
|
||||
FOR_LIMB(i, 0, j + 1, {
|
||||
accum2 += widemul(a[j - i], b[i]);
|
||||
accum1 += widemul(aa[j - i], bb[i]);
|
||||
accum0 += widemul(a[8 + j - i], b[8 + i]);
|
||||
}
|
||||
); accum1 -= accum2; accum0 += accum2;
|
||||
accum2 = 0;
|
||||
FOR_LIMB(i, j + 1, 8, {
|
||||
accum0 -=
|
||||
widemul(a[8 + j - i], b[i]);
|
||||
accum2 +=
|
||||
widemul(aa[8 + j - i],
|
||||
bb[i]);
|
||||
accum1 += widemul(a[16 + j - i], b[8 + i]);
|
||||
}
|
||||
);
|
||||
accum1 += accum2;
|
||||
accum0 += accum2;
|
||||
c[j] = ((uint32_t)(accum0)) & mask;
|
||||
c[j + 8] = ((uint32_t)(accum1)) & mask;
|
||||
accum0 >>= 28; accum1 >>= 28;
|
||||
});
|
||||
|
||||
c[j] = ((uint32_t)(accum0)) & mask;
|
||||
c[j+8] = ((uint32_t)(accum1)) & mask;
|
||||
|
||||
accum0 >>= 28;
|
||||
accum1 >>= 28;
|
||||
});
|
||||
|
||||
accum0 += accum1;
|
||||
accum0 += c[8];
|
||||
accum1 += c[0];
|
||||
c[8] = ((uint32_t)(accum0)) & mask;
|
||||
c[0] = ((uint32_t)(accum1)) & mask;
|
||||
|
||||
|
||||
accum0 >>= 28;
|
||||
accum1 >>= 28;
|
||||
c[9] += ((uint32_t)(accum0));
|
||||
c[1] += ((uint32_t)(accum1));
|
||||
}
|
||||
|
||||
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
|
||||
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
|
||||
{
|
||||
const uint32_t *a = as->limb;
|
||||
uint32_t *c = cs->limb;
|
||||
uint64_t accum0 = 0, accum8 = 0;
|
||||
uint32_t mask = (1<<28)-1;
|
||||
uint32_t mask = (1 << 28) - 1;
|
||||
int i;
|
||||
|
||||
assert(b<1<<28);
|
||||
assert(b < 1 << 28);
|
||||
|
||||
FOR_LIMB(i,0,8,{
|
||||
accum0 += widemul(b, a[i]);
|
||||
accum8 += widemul(b, a[i+8]);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
});
|
||||
FOR_LIMB(i, 0, 8, {
|
||||
accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]);
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask; accum8 >>= 28;
|
||||
});
|
||||
|
||||
accum0 += accum8 + c[8];
|
||||
c[8] = accum0 & mask;
|
||||
@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
|
||||
c[1] += accum8 >> 28;
|
||||
}
|
||||
|
||||
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
gf_mul(cs,as,as); /* Performs better with a dedicated square */
|
||||
void gf_sqr(gf_s * __restrict__ cs, const gf as)
|
||||
{
|
||||
gf_mul(cs, as, as); /* Performs better with a dedicated square */
|
||||
}
|
||||
|
||||
|
@ -13,43 +13,46 @@
|
||||
#define LIMB(x) (x)&((1<<28)-1), (x)>>28
|
||||
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
|
||||
{{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
|
||||
|
||||
|
||||
#define LIMB_PLACE_VALUE(i) 28
|
||||
|
||||
void gf_add_RAW (gf out, const gf a, const gf b) {
|
||||
void gf_add_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
|
||||
out->limb[i] = a->limb[i] + b->limb[i];
|
||||
}
|
||||
}
|
||||
|
||||
void gf_sub_RAW (gf out, const gf a, const gf b) {
|
||||
void gf_sub_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
|
||||
out->limb[i] = a->limb[i] - b->limb[i];
|
||||
}
|
||||
}
|
||||
|
||||
void gf_bias (gf a, int amt) {
|
||||
void gf_bias(gf a, int amt)
|
||||
{
|
||||
unsigned int i;
|
||||
uint32_t co1 = ((1<<28)-1)*amt, co2 = co1-amt;
|
||||
uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt;
|
||||
|
||||
for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) {
|
||||
a->limb[i] += (i==sizeof(*a)/sizeof(a->limb[0])/2) ? co2 : co1;
|
||||
for (i = 0; i < sizeof(*a) / sizeof(a->limb[0]); i++) {
|
||||
a->limb[i] += (i == sizeof(*a) / sizeof(a->limb[0]) / 2) ? co2 : co1;
|
||||
}
|
||||
}
|
||||
|
||||
void gf_weak_reduce (gf a) {
|
||||
uint32_t mask = (1<<28) - 1;
|
||||
void gf_weak_reduce(gf a)
|
||||
{
|
||||
uint32_t mask = (1 << 28) - 1;
|
||||
uint32_t tmp = a->limb[15] >> 28;
|
||||
unsigned int i;
|
||||
|
||||
a->limb[8] += tmp;
|
||||
for (i=15; i>0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
|
||||
for (i = 15; i > 0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
|
||||
}
|
||||
a->limb[0] = (a->limb[0] & mask) + tmp;
|
||||
}
|
||||
|
||||
|
@ -11,22 +11,26 @@
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
|
||||
#define __ARCH_ARM_32_ARCH_INTRINSICS_H__
|
||||
# define __ARCH_ARM_32_ARCH_INTRINSICS_H__
|
||||
|
||||
#define ARCH_WORD_BITS 32
|
||||
# define ARCH_WORD_BITS 32
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
uint32_t word_is_zero(uint32_t a) {
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
uint32_t word_is_zero(uint32_t a)
|
||||
{
|
||||
uint32_t ret;
|
||||
asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
|
||||
asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
uint64_t widemul(uint32_t a, uint32_t b) {
|
||||
/* Could be UMULL, but it's hard to express to CC that the registers must be different */
|
||||
return ((uint64_t)a) * b;
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
uint64_t widemul(uint32_t a, uint32_t b)
|
||||
{
|
||||
/*
|
||||
* Could be UMULL, but it's hard to express to CC that the registers must
|
||||
* be different
|
||||
*/
|
||||
return ((uint64_t)a) * b;
|
||||
}
|
||||
|
||||
#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
|
||||
|
||||
#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
|
||||
|
@ -12,100 +12,89 @@
|
||||
|
||||
#include "f_field.h"
|
||||
|
||||
static inline void __attribute__((gnu_inline,always_inline))
|
||||
smlal (
|
||||
uint64_t *acc,
|
||||
const uint32_t a,
|
||||
const uint32_t b
|
||||
) {
|
||||
static inline void __attribute__ ((gnu_inline, always_inline))
|
||||
smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
|
||||
{
|
||||
|
||||
#ifdef __ARMEL__
|
||||
uint32_t lo = *acc, hi = (*acc)>>32;
|
||||
|
||||
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
|
||||
: [lo]"+&r"(lo), [hi]"+&r"(hi)
|
||||
: [a]"r"(a), [b]"r"(b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi)<<32);
|
||||
uint32_t lo = *acc, hi = (*acc) >> 32;
|
||||
|
||||
__asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
|
||||
[hi] "+&r"(hi)
|
||||
:[a] "r"(a),[b] "r"(b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi) << 32);
|
||||
#else
|
||||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
|
||||
*acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void __attribute__((gnu_inline,always_inline))
|
||||
smlal2 (
|
||||
uint64_t *acc,
|
||||
const uint32_t a,
|
||||
const uint32_t b
|
||||
) {
|
||||
static inline void __attribute__ ((gnu_inline, always_inline))
|
||||
smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
|
||||
{
|
||||
#ifdef __ARMEL__
|
||||
uint32_t lo = *acc, hi = (*acc)>>32;
|
||||
|
||||
__asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
|
||||
: [lo]"+&r"(lo), [hi]"+&r"(hi)
|
||||
: [a]"r"(a), [b]"r"(2*b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi)<<32);
|
||||
uint32_t lo = *acc, hi = (*acc) >> 32;
|
||||
|
||||
__asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
|
||||
[hi] "+&r"(hi)
|
||||
:[a] "r"(a),[b] "r"(2 * b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi) << 32);
|
||||
#else
|
||||
*acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
|
||||
*acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void __attribute__((gnu_inline,always_inline))
|
||||
smull (
|
||||
uint64_t *acc,
|
||||
const uint32_t a,
|
||||
const uint32_t b
|
||||
) {
|
||||
static inline void __attribute__ ((gnu_inline, always_inline))
|
||||
smull(uint64_t *acc, const uint32_t a, const uint32_t b)
|
||||
{
|
||||
#ifdef __ARMEL__
|
||||
uint32_t lo, hi;
|
||||
|
||||
__asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
|
||||
: [lo]"=&r"(lo), [hi]"=&r"(hi)
|
||||
: [a]"r"(a), [b]"r"(b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi)<<32);
|
||||
|
||||
__asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo),
|
||||
[hi] "=&r"(hi)
|
||||
:[a] "r"(a),[b] "r"(b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi) << 32);
|
||||
#else
|
||||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
|
||||
*acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline void __attribute__((gnu_inline,always_inline))
|
||||
smull2 (
|
||||
uint64_t *acc,
|
||||
const uint32_t a,
|
||||
const uint32_t b
|
||||
) {
|
||||
static inline void __attribute__ ((gnu_inline, always_inline))
|
||||
smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
|
||||
{
|
||||
#ifdef __ARMEL__
|
||||
uint32_t lo, hi;
|
||||
|
||||
|
||||
__asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
|
||||
: [lo]"=&r"(lo), [hi]"=&r"(hi)
|
||||
: [a]"r"(a), [b]"r"(2*b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi)<<32);
|
||||
: [lo] "=&r"(lo),[hi] "=&r"(hi)
|
||||
: [a] "r"(a),[b] "r"(2 * b));
|
||||
|
||||
*acc = lo + (((uint64_t)hi) << 32);
|
||||
#else
|
||||
*acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
|
||||
*acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
|
||||
#endif
|
||||
}
|
||||
|
||||
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
|
||||
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
|
||||
{
|
||||
|
||||
const uint32_t *a = as->limb, *b = bs->limb;
|
||||
uint32_t *c = cs->limb;
|
||||
|
||||
uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
|
||||
uint32_t mask = (1<<28) - 1;
|
||||
uint32_t mask = (1 << 28) - 1;
|
||||
|
||||
uint32_t aa[8], bm[8];
|
||||
|
||||
int i;
|
||||
for (i=0; i<8; i++) {
|
||||
aa[i] = a[i] + a[i+8];
|
||||
bm[i] = b[i] - b[i+8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
aa[i] = a[i] + a[i + 8];
|
||||
bm[i] = b[i] - b[i + 8];
|
||||
}
|
||||
|
||||
uint32_t ax,bx;
|
||||
uint32_t ax, bx;
|
||||
{
|
||||
/* t^3 terms */
|
||||
smull(&accum1, ax = aa[1], bx = b[15]);
|
||||
@ -121,15 +110,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum1, ax, bx = b[10]);
|
||||
smlal(&accum3, ax = aa[7], bx);
|
||||
smlal(&accum1, ax, bx = b[9]);
|
||||
|
||||
|
||||
accum0 = accum1;
|
||||
accum2 = accum3;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
smlal(&accum2, ax = aa[0], bx);
|
||||
smlal(&accum0, ax, bx = b[8]);
|
||||
smlal(&accum2, ax = aa[1], bx);
|
||||
|
||||
|
||||
smlal(&accum0, ax = a[9], bx = b[7]);
|
||||
smlal(&accum2, ax = a[10], bx);
|
||||
smlal(&accum0, ax, bx = b[6]);
|
||||
@ -143,14 +132,14 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum0, ax, bx = b[2]);
|
||||
smlal(&accum2, ax = a[15], bx);
|
||||
smlal(&accum0, ax, bx = b[1]);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 += accum2;
|
||||
smlal(&accum3, ax = a[8], bx);
|
||||
smlal(&accum1, ax, bx = b[0]);
|
||||
smlal(&accum3, ax = a[9], bx);
|
||||
|
||||
|
||||
smlal(&accum1, ax = a[1], bx = bm[7]);
|
||||
smlal(&accum3, ax = a[2], bx);
|
||||
smlal(&accum1, ax, bx = bm[6]);
|
||||
@ -164,20 +153,20 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum1, ax, bx = bm[2]);
|
||||
smlal(&accum3, ax = a[7], bx);
|
||||
smlal(&accum1, ax, bx = bm[1]);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
smlal(&accum2, ax = a[0], bx);
|
||||
smlal(&accum0, ax, bx = bm[0]);
|
||||
smlal(&accum2, ax = a[1], bx);
|
||||
|
||||
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[0] = ((uint32_t)(accum0)) & mask;
|
||||
c[1] = ((uint32_t)(accum2)) & mask;
|
||||
c[8] = ((uint32_t)(accum1)) & mask;
|
||||
c[9] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accumC0 = accum2 >> 28;
|
||||
accumC1 = accum3 >> 28;
|
||||
}
|
||||
@ -192,10 +181,10 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum1, ax, bx = b[12]);
|
||||
smlal(&accum3, ax = aa[7], bx);
|
||||
smlal(&accum1, ax, bx = b[11]);
|
||||
|
||||
|
||||
accum0 = accum1;
|
||||
accum2 = accum3;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
smlal(&accum2, ax = aa[0], bx);
|
||||
smlal(&accum0, ax, bx = b[10]);
|
||||
@ -204,7 +193,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum2, ax = aa[2], bx);
|
||||
smlal(&accum0, ax, bx = b[8]);
|
||||
smlal(&accum2, ax = aa[3], bx);
|
||||
|
||||
|
||||
smlal(&accum0, ax = a[11], bx = b[7]);
|
||||
smlal(&accum2, ax = a[12], bx);
|
||||
smlal(&accum0, ax, bx = b[6]);
|
||||
@ -214,7 +203,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum0, ax, bx = b[4]);
|
||||
smlal(&accum2, ax = a[15], bx);
|
||||
smlal(&accum0, ax, bx = b[3]);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 += accum2;
|
||||
@ -225,7 +214,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum3, ax = a[10], bx);
|
||||
smlal(&accum1, ax, bx = b[0]);
|
||||
smlal(&accum3, ax = a[11], bx);
|
||||
|
||||
|
||||
smlal(&accum1, ax = a[3], bx = bm[7]);
|
||||
smlal(&accum3, ax = a[4], bx);
|
||||
smlal(&accum1, ax, bx = bm[6]);
|
||||
@ -235,7 +224,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum1, ax, bx = bm[4]);
|
||||
smlal(&accum3, ax = a[7], bx);
|
||||
smlal(&accum1, ax, bx = bm[3]);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
smlal(&accum2, ax = a[0], bx);
|
||||
smlal(&accum0, ax, bx = bm[2]);
|
||||
@ -244,34 +233,34 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum2, ax = a[2], bx);
|
||||
smlal(&accum0, ax, bx = bm[0]);
|
||||
smlal(&accum2, ax = a[3], bx);
|
||||
|
||||
|
||||
accum0 += accumC0;
|
||||
accum1 += accumC1;
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[2] = ((uint32_t)(accum0)) & mask;
|
||||
c[3] = ((uint32_t)(accum2)) & mask;
|
||||
c[10] = ((uint32_t)(accum1)) & mask;
|
||||
c[11] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accumC0 = accum2 >> 28;
|
||||
accumC1 = accum3 >> 28;
|
||||
}
|
||||
{
|
||||
|
||||
|
||||
/* t^3 terms */
|
||||
smull(&accum1, ax = aa[5], bx = b[15]);
|
||||
smull(&accum3, ax = aa[6], bx);
|
||||
smlal(&accum1, ax, bx = b[14]);
|
||||
smlal(&accum3, ax = aa[7], bx);
|
||||
smlal(&accum1, ax, bx = b[13]);
|
||||
|
||||
|
||||
accum0 = accum1;
|
||||
accum2 = accum3;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
|
||||
|
||||
smlal(&accum2, ax = aa[0], bx);
|
||||
smlal(&accum0, ax, bx = b[12]);
|
||||
smlal(&accum2, ax = aa[1], bx);
|
||||
@ -283,18 +272,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum2, ax = aa[4], bx);
|
||||
smlal(&accum0, ax, bx = b[8]);
|
||||
smlal(&accum2, ax = aa[5], bx);
|
||||
|
||||
|
||||
|
||||
smlal(&accum0, ax = a[13], bx = b[7]);
|
||||
smlal(&accum2, ax = a[14], bx);
|
||||
smlal(&accum0, ax, bx = b[6]);
|
||||
smlal(&accum2, ax = a[15], bx);
|
||||
smlal(&accum0, ax, bx = b[5]);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 += accum2;
|
||||
|
||||
|
||||
smlal(&accum3, ax = a[8], bx);
|
||||
smlal(&accum1, ax, bx = b[4]);
|
||||
smlal(&accum3, ax = a[9], bx);
|
||||
@ -306,16 +294,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum3, ax = a[12], bx);
|
||||
smlal(&accum1, ax, bx = b[0]);
|
||||
smlal(&accum3, ax = a[13], bx);
|
||||
|
||||
|
||||
|
||||
smlal(&accum1, ax = a[5], bx = bm[7]);
|
||||
smlal(&accum3, ax = a[6], bx);
|
||||
smlal(&accum1, ax, bx = bm[6]);
|
||||
smlal(&accum3, ax = a[7], bx);
|
||||
smlal(&accum1, ax, bx = bm[5]);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
|
||||
|
||||
smlal(&accum2, ax = a[0], bx);
|
||||
smlal(&accum0, ax, bx = bm[4]);
|
||||
smlal(&accum2, ax = a[1], bx);
|
||||
@ -327,28 +314,28 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum2, ax = a[4], bx);
|
||||
smlal(&accum0, ax, bx = bm[0]);
|
||||
smlal(&accum2, ax = a[5], bx);
|
||||
|
||||
|
||||
accum0 += accumC0;
|
||||
accum1 += accumC1;
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[4] = ((uint32_t)(accum0)) & mask;
|
||||
c[5] = ((uint32_t)(accum2)) & mask;
|
||||
c[12] = ((uint32_t)(accum1)) & mask;
|
||||
c[13] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accumC0 = accum2 >> 28;
|
||||
accumC1 = accum3 >> 28;
|
||||
}
|
||||
{
|
||||
|
||||
|
||||
/* t^3 terms */
|
||||
smull(&accum1, ax = aa[7], bx = b[15]);
|
||||
accum0 = accum1;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
|
||||
|
||||
smull(&accum2, ax = aa[0], bx);
|
||||
smlal(&accum0, ax, bx = b[14]);
|
||||
smlal(&accum2, ax = aa[1], bx);
|
||||
@ -364,14 +351,13 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum2, ax = aa[6], bx);
|
||||
smlal(&accum0, ax, bx = b[8]);
|
||||
smlal(&accum2, ax = aa[7], bx);
|
||||
|
||||
|
||||
|
||||
smlal(&accum0, ax = a[15], bx = b[7]);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 = accum2;
|
||||
|
||||
|
||||
smlal(&accum3, ax = a[8], bx);
|
||||
smlal(&accum1, ax, bx = b[6]);
|
||||
smlal(&accum3, ax = a[9], bx);
|
||||
@ -387,12 +373,11 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum3, ax = a[14], bx);
|
||||
smlal(&accum1, ax, bx = b[0]);
|
||||
smlal(&accum3, ax = a[15], bx);
|
||||
|
||||
|
||||
|
||||
smlal(&accum1, ax = a[7], bx = bm[7]);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
|
||||
|
||||
smlal(&accum2, ax = a[0], bx);
|
||||
smlal(&accum0, ax, bx = bm[6]);
|
||||
smlal(&accum2, ax = a[1], bx);
|
||||
@ -408,17 +393,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
smlal(&accum2, ax = a[6], bx);
|
||||
smlal(&accum0, ax, bx = bm[0]);
|
||||
smlal(&accum2, ax = a[7], bx);
|
||||
|
||||
|
||||
accum0 += accumC0;
|
||||
accum1 += accumC1;
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[6] = ((uint32_t)(accum0)) & mask;
|
||||
c[7] = ((uint32_t)(accum2)) & mask;
|
||||
c[14] = ((uint32_t)(accum1)) & mask;
|
||||
c[15] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accum0 = accum2 >> 28;
|
||||
accum1 = accum3 >> 28;
|
||||
}
|
||||
@ -428,28 +413,29 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
accum1 += c[0];
|
||||
c[8] = ((uint32_t)(accum0)) & mask;
|
||||
c[0] = ((uint32_t)(accum1)) & mask;
|
||||
|
||||
|
||||
accum0 >>= 28;
|
||||
accum1 >>= 28;
|
||||
c[9] += ((uint32_t)(accum0));
|
||||
c[1] += ((uint32_t)(accum1));
|
||||
}
|
||||
|
||||
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
void gf_sqr(gf_s * __restrict__ cs, const gf as)
|
||||
{
|
||||
const uint32_t *a = as->limb;
|
||||
uint32_t *c = cs->limb;
|
||||
|
||||
uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
|
||||
uint32_t mask = (1<<28) - 1;
|
||||
uint32_t mask = (1 << 28) - 1;
|
||||
|
||||
uint32_t bm[8];
|
||||
|
||||
|
||||
int i;
|
||||
for (i=0; i<8; i++) {
|
||||
bm[i] = a[i] - a[i+8];
|
||||
for (i = 0; i < 8; i++) {
|
||||
bm[i] = a[i] - a[i + 8];
|
||||
}
|
||||
|
||||
uint32_t ax,bx;
|
||||
uint32_t ax, bx;
|
||||
{
|
||||
/* t^3 terms */
|
||||
smull2(&accum1, ax = a[9], bx = a[15]);
|
||||
@ -459,14 +445,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum1, ax, bx = a[13]);
|
||||
smlal2(&accum3, ax = a[12], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
accum0 = accum1;
|
||||
accum2 = accum3;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
smlal2(&accum2, ax = a[8], a[9]);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
smlal2(&accum0, ax = a[1], bx = a[7]);
|
||||
smlal2(&accum2, ax = a[2], bx);
|
||||
smlal2(&accum0, ax, bx = a[6]);
|
||||
@ -474,18 +460,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum0, ax, bx = a[5]);
|
||||
smlal2(&accum2, ax = a[4], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 += accum2;
|
||||
smlal2(&accum3, ax = a[0], bx = a[1]);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
accum1 = -accum1;
|
||||
accum3 = -accum3;
|
||||
accum2 = -accum2;
|
||||
accum0 = -accum0;
|
||||
|
||||
|
||||
smlal2(&accum1, ax = bm[1], bx = bm[7]);
|
||||
smlal2(&accum3, ax = bm[2], bx);
|
||||
smlal2(&accum1, ax, bx = bm[6]);
|
||||
@ -493,22 +479,26 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum1, ax, bx = bm[5]);
|
||||
smlal2(&accum3, ax = bm[4], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
smlal2(&accum2, ax = bm[0], bx = bm[1]);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
|
||||
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
|
||||
|
||||
|
||||
tmp = -accum3;
|
||||
accum3 = tmp - accum2;
|
||||
accum2 = tmp;
|
||||
tmp = -accum1;
|
||||
accum1 = tmp - accum0;
|
||||
accum0 = tmp;
|
||||
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[0] = ((uint32_t)(accum0)) & mask;
|
||||
c[1] = ((uint32_t)(accum2)) & mask;
|
||||
c[8] = ((uint32_t)(accum1)) & mask;
|
||||
c[9] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accumC0 = accum2 >> 28;
|
||||
accumC1 = accum3 >> 28;
|
||||
}
|
||||
@ -519,22 +509,22 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum1, ax, bx = a[14]);
|
||||
smlal2(&accum3, ax = a[13], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
accum0 = accum1;
|
||||
accum2 = accum3;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
smlal2(&accum2, ax = a[8], bx = a[11]);
|
||||
smlal2(&accum0, ax, bx = a[10]);
|
||||
smlal2(&accum2, ax = a[9], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
smlal2(&accum0, ax = a[3], bx = a[7]);
|
||||
smlal2(&accum2, ax = a[4], bx);
|
||||
smlal2(&accum0, ax, bx = a[6]);
|
||||
smlal2(&accum2, ax = a[5], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 += accum2;
|
||||
@ -542,119 +532,124 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum1, ax, bx = a[2]);
|
||||
smlal2(&accum3, ax = a[1], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
accum1 = -accum1;
|
||||
accum3 = -accum3;
|
||||
accum2 = -accum2;
|
||||
accum0 = -accum0;
|
||||
|
||||
|
||||
smlal2(&accum1, ax = bm[3], bx = bm[7]);
|
||||
smlal2(&accum3, ax = bm[4], bx);
|
||||
smlal2(&accum1, ax, bx = bm[6]);
|
||||
smlal2(&accum3, ax = bm[5], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
smlal2(&accum2, ax = bm[0], bx = bm[3]);
|
||||
smlal2(&accum0, ax, bx = bm[2]);
|
||||
smlal2(&accum2, ax = bm[1], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
|
||||
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
|
||||
|
||||
|
||||
tmp = -accum3;
|
||||
accum3 = tmp - accum2;
|
||||
accum2 = tmp;
|
||||
tmp = -accum1;
|
||||
accum1 = tmp - accum0;
|
||||
accum0 = tmp;
|
||||
|
||||
accum0 += accumC0;
|
||||
accum1 += accumC1;
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[2] = ((uint32_t)(accum0)) & mask;
|
||||
c[3] = ((uint32_t)(accum2)) & mask;
|
||||
c[10] = ((uint32_t)(accum1)) & mask;
|
||||
c[11] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accumC0 = accum2 >> 28;
|
||||
accumC1 = accum3 >> 28;
|
||||
}
|
||||
{
|
||||
|
||||
|
||||
/* t^3 terms */
|
||||
smull2(&accum1, ax = a[13], bx = a[15]);
|
||||
smull2(&accum3, ax = a[14], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
accum0 = accum1;
|
||||
accum2 = accum3;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
|
||||
|
||||
smlal2(&accum2, ax = a[8], bx = a[13]);
|
||||
smlal2(&accum0, ax, bx = a[12]);
|
||||
smlal2(&accum2, ax = a[9], bx);
|
||||
smlal2(&accum0, ax, bx = a[11]);
|
||||
smlal2(&accum2, ax = a[10], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
|
||||
smlal2(&accum0, ax = a[5], bx = a[7]);
|
||||
smlal2(&accum2, ax = a[6], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 += accum2;
|
||||
|
||||
|
||||
smlal2(&accum3, ax = a[0], bx = a[5]);
|
||||
smlal2(&accum1, ax, bx = a[4]);
|
||||
smlal2(&accum3, ax = a[1], bx);
|
||||
smlal2(&accum1, ax, bx = a[3]);
|
||||
smlal2(&accum3, ax = a[2], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
accum1 = -accum1;
|
||||
accum3 = -accum3;
|
||||
accum2 = -accum2;
|
||||
accum0 = -accum0;
|
||||
|
||||
|
||||
smlal2(&accum1, ax = bm[5], bx = bm[7]);
|
||||
smlal2(&accum3, ax = bm[6], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
|
||||
|
||||
smlal2(&accum2, ax = bm[0], bx = bm[5]);
|
||||
smlal2(&accum0, ax, bx = bm[4]);
|
||||
smlal2(&accum2, ax = bm[1], bx);
|
||||
smlal2(&accum0, ax, bx = bm[3]);
|
||||
smlal2(&accum2, ax = bm[2], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
|
||||
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
|
||||
|
||||
|
||||
tmp = -accum3;
|
||||
accum3 = tmp - accum2;
|
||||
accum2 = tmp;
|
||||
tmp = -accum1;
|
||||
accum1 = tmp - accum0;
|
||||
accum0 = tmp;
|
||||
|
||||
accum0 += accumC0;
|
||||
accum1 += accumC1;
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[4] = ((uint32_t)(accum0)) & mask;
|
||||
c[5] = ((uint32_t)(accum2)) & mask;
|
||||
c[12] = ((uint32_t)(accum1)) & mask;
|
||||
c[13] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accumC0 = accum2 >> 28;
|
||||
accumC1 = accum3 >> 28;
|
||||
}
|
||||
{
|
||||
|
||||
|
||||
/* t^3 terms */
|
||||
smull(&accum1, ax = a[15], bx = a[15]);
|
||||
accum0 = accum1;
|
||||
|
||||
|
||||
/* t^2 terms */
|
||||
|
||||
|
||||
smull2(&accum2, ax = a[8], bx);
|
||||
smlal2(&accum0, ax, bx = a[14]);
|
||||
smlal2(&accum2, ax = a[9], bx);
|
||||
@ -663,14 +658,13 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum0, ax, bx = a[12]);
|
||||
smlal2(&accum2, ax = a[11], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
|
||||
|
||||
smlal(&accum0, ax = a[7], bx = a[7]);
|
||||
|
||||
|
||||
/* t terms */
|
||||
accum1 += accum0;
|
||||
accum3 = accum2;
|
||||
|
||||
|
||||
smlal2(&accum3, ax = a[0], bx);
|
||||
smlal2(&accum1, ax, bx = a[6]);
|
||||
smlal2(&accum3, ax = a[1], bx);
|
||||
@ -679,17 +673,17 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum1, ax, bx = a[4]);
|
||||
smlal2(&accum3, ax = a[3], bx);
|
||||
smlal(&accum1, ax, ax);
|
||||
|
||||
|
||||
accum1 = -accum1;
|
||||
accum3 = -accum3;
|
||||
accum2 = -accum2;
|
||||
accum0 = -accum0;
|
||||
|
||||
|
||||
bx = bm[7];
|
||||
smlal(&accum1, bx, bx);
|
||||
|
||||
|
||||
/* 1 terms */
|
||||
|
||||
|
||||
smlal2(&accum2, ax = bm[0], bx);
|
||||
smlal2(&accum0, ax, bx = bm[6]);
|
||||
smlal2(&accum2, ax = bm[1], bx);
|
||||
@ -698,21 +692,24 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
smlal2(&accum0, ax, bx = bm[4]);
|
||||
smlal2(&accum2, ax = bm[3], bx);
|
||||
smlal(&accum0, ax, ax);
|
||||
|
||||
tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
|
||||
tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
|
||||
|
||||
|
||||
|
||||
tmp = -accum3;
|
||||
accum3 = tmp - accum2;
|
||||
accum2 = tmp;
|
||||
tmp = -accum1;
|
||||
accum1 = tmp - accum0;
|
||||
accum0 = tmp;
|
||||
|
||||
accum0 += accumC0;
|
||||
accum1 += accumC1;
|
||||
accum2 += accum0 >> 28;
|
||||
accum3 += accum1 >> 28;
|
||||
|
||||
|
||||
c[6] = ((uint32_t)(accum0)) & mask;
|
||||
c[7] = ((uint32_t)(accum2)) & mask;
|
||||
c[14] = ((uint32_t)(accum1)) & mask;
|
||||
c[15] = ((uint32_t)(accum3)) & mask;
|
||||
|
||||
|
||||
accum0 = accum2 >> 28;
|
||||
accum1 = accum3 >> 28;
|
||||
}
|
||||
@ -722,21 +719,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
accum1 += c[0];
|
||||
c[8] = ((uint32_t)(accum0)) & mask;
|
||||
c[0] = ((uint32_t)(accum1)) & mask;
|
||||
|
||||
|
||||
accum0 >>= 28;
|
||||
accum1 >>= 28;
|
||||
c[9] += ((uint32_t)(accum0));
|
||||
c[1] += ((uint32_t)(accum1));
|
||||
}
|
||||
|
||||
void gf_mulw_unsigned (
|
||||
gf_s *__restrict__ cs,
|
||||
const gf as,
|
||||
uint32_t b
|
||||
) {
|
||||
uint32_t mask = (1ull<<28)-1;
|
||||
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
|
||||
{
|
||||
uint32_t mask = (1ull << 28) - 1;
|
||||
assert(b <= mask);
|
||||
|
||||
|
||||
const uint32_t *a = as->limb;
|
||||
uint32_t *c = cs->limb;
|
||||
|
||||
@ -745,75 +739,99 @@ void gf_mulw_unsigned (
|
||||
int i;
|
||||
|
||||
uint32_t c0, c8, n0, n8;
|
||||
c0 = a[0]; c8 = a[8];
|
||||
c0 = a[0];
|
||||
c8 = a[8];
|
||||
accum0 = widemul(b, c0);
|
||||
accum8 = widemul(b, c8);
|
||||
|
||||
c[0] = accum0 & mask; accum0 >>= 28;
|
||||
c[8] = accum8 & mask; accum8 >>= 28;
|
||||
|
||||
i=1;
|
||||
c[0] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
|
||||
i = 1;
|
||||
{
|
||||
n0 = a[i]; n8 = a[i+8];
|
||||
n0 = a[i];
|
||||
n8 = a[i + 8];
|
||||
smlal(&accum0, b, n0);
|
||||
smlal(&accum8, b, n8);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
i++;
|
||||
}
|
||||
{
|
||||
c0 = a[i]; c8 = a[i+8];
|
||||
c0 = a[i];
|
||||
c8 = a[i + 8];
|
||||
smlal(&accum0, b, c0);
|
||||
smlal(&accum8, b, c8);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
i++;
|
||||
}
|
||||
{
|
||||
n0 = a[i]; n8 = a[i+8];
|
||||
n0 = a[i];
|
||||
n8 = a[i + 8];
|
||||
smlal(&accum0, b, n0);
|
||||
smlal(&accum8, b, n8);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
i++;
|
||||
}
|
||||
{
|
||||
c0 = a[i]; c8 = a[i+8];
|
||||
c0 = a[i];
|
||||
c8 = a[i + 8];
|
||||
smlal(&accum0, b, c0);
|
||||
smlal(&accum8, b, c8);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
i++;
|
||||
}
|
||||
{
|
||||
n0 = a[i]; n8 = a[i+8];
|
||||
n0 = a[i];
|
||||
n8 = a[i + 8];
|
||||
smlal(&accum0, b, n0);
|
||||
smlal(&accum8, b, n8);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
i++;
|
||||
}
|
||||
{
|
||||
c0 = a[i]; c8 = a[i+8];
|
||||
c0 = a[i];
|
||||
c8 = a[i + 8];
|
||||
smlal(&accum0, b, c0);
|
||||
smlal(&accum8, b, c8);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
i++;
|
||||
}
|
||||
{
|
||||
n0 = a[i]; n8 = a[i+8];
|
||||
n0 = a[i];
|
||||
n8 = a[i + 8];
|
||||
smlal(&accum0, b, n0);
|
||||
smlal(&accum8, b, n8);
|
||||
|
||||
c[i] = accum0 & mask; accum0 >>= 28;
|
||||
c[i+8] = accum8 & mask; accum8 >>= 28;
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 28;
|
||||
c[i + 8] = accum8 & mask;
|
||||
accum8 >>= 28;
|
||||
i++;
|
||||
}
|
||||
|
||||
|
@ -14,48 +14,52 @@
|
||||
#define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
|
||||
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
|
||||
{{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
|
||||
|
||||
|
||||
#define LIMB_PLACE_VALUE(i) 28
|
||||
|
||||
void gf_add_RAW (gf out, const gf a, const gf b) {
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
|
||||
void gf_add_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t *) out)[i] =
|
||||
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
|
||||
}
|
||||
/*
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
out->limb[i] = a->limb[i] + b->limb[i];
|
||||
}
|
||||
*/
|
||||
* for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
* out->limb[i] = a->limb[i] + b->limb[i];
|
||||
* }
|
||||
*/
|
||||
}
|
||||
|
||||
void gf_sub_RAW (gf out, const gf a, const gf b) {
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
|
||||
void gf_sub_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t *) out)[i] =
|
||||
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
|
||||
}
|
||||
/*
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
out->limb[i] = a->limb[i] - b->limb[i];
|
||||
}
|
||||
*/
|
||||
* for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
* out->limb[i] = a->limb[i] - b->limb[i];
|
||||
* }
|
||||
*/
|
||||
}
|
||||
|
||||
void gf_bias (gf a, int amt) {
|
||||
uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
|
||||
uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
|
||||
uint32x4_t *aa = (uint32x4_t*) a;
|
||||
void gf_bias(gf a, int amt)
|
||||
{
|
||||
uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
|
||||
uint32x4_t lo = { co1, co1, co1, co1 }, hi = {
|
||||
co2, co1, co1, co1};
|
||||
uint32x4_t *aa = (uint32x4_t *) a;
|
||||
aa[0] += lo;
|
||||
aa[1] += lo;
|
||||
aa[2] += hi;
|
||||
aa[3] += lo;
|
||||
}
|
||||
|
||||
void gf_weak_reduce (gf a) {
|
||||
uint64_t mask = (1ull<<28) - 1;
|
||||
void gf_weak_reduce(gf a)
|
||||
{
|
||||
uint64_t mask = (1ull << 28) - 1;
|
||||
uint64_t tmp = a->limb[15] >> 28;
|
||||
a->limb[8] += tmp;
|
||||
for (unsigned int i=15; i>0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
|
||||
for (unsigned int i = 15; i > 0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
|
||||
}
|
||||
a->limb[0] = (a->limb[0] & mask) + tmp;
|
||||
}
|
||||
|
||||
|
@ -11,22 +11,26 @@
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
|
||||
#define __ARCH_NEON_ARCH_INTRINSICS_H__
|
||||
# define __ARCH_NEON_ARCH_INTRINSICS_H__
|
||||
|
||||
#define ARCH_WORD_BITS 32
|
||||
# define ARCH_WORD_BITS 32
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
uint32_t word_is_zero(uint32_t a) {
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
uint32_t word_is_zero(uint32_t a)
|
||||
{
|
||||
uint32_t ret;
|
||||
__asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
|
||||
__asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
uint64_t widemul(uint32_t a, uint32_t b) {
|
||||
/* Could be UMULL, but it's hard to express to CC that the registers must be different */
|
||||
return ((uint64_t)a) * b;
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
uint64_t widemul(uint32_t a, uint32_t b)
|
||||
{
|
||||
/*
|
||||
* Could be UMULL, but it's hard to express to CC that the registers must
|
||||
* be different
|
||||
*/
|
||||
return ((uint64_t)a) * b;
|
||||
}
|
||||
|
||||
#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
|
||||
|
||||
#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -15,50 +15,55 @@
|
||||
#define USE_NEON_PERM 1
|
||||
#define LIMBHI(x) ((x##ull)>>28)
|
||||
#define LIMBLO(x) ((x##ull)&((1ull<<28)-1))
|
||||
# define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
|
||||
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
|
||||
{{LIMBLO(a),LIMBLO(e), LIMBHI(a),LIMBHI(e), \
|
||||
LIMBLO(b),LIMBLO(f), LIMBHI(b),LIMBHI(f), \
|
||||
LIMBLO(c),LIMBLO(g), LIMBHI(c),LIMBHI(g), \
|
||||
LIMBLO(d),LIMBLO(h), LIMBHI(d),LIMBHI(h)}}
|
||||
|
||||
|
||||
#define LIMB_PLACE_VALUE(i) 28
|
||||
|
||||
void gf_add_RAW (gf out, const gf a, const gf b) {
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
|
||||
void gf_add_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t *) out)[i] =
|
||||
((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
|
||||
}
|
||||
}
|
||||
|
||||
void gf_sub_RAW (gf out, const gf a, const gf b) {
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
|
||||
void gf_sub_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
|
||||
((uint32xn_t *) out)[i] =
|
||||
((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
|
||||
}
|
||||
/*
|
||||
unsigned int i;
|
||||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
out->limb[i] = a->limb[i] - b->limb[i];
|
||||
}
|
||||
*/
|
||||
* unsigned int i;
|
||||
* for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
* out->limb[i] = a->limb[i] - b->limb[i];
|
||||
* }
|
||||
*/
|
||||
}
|
||||
|
||||
void gf_bias (gf a, int amt) {
|
||||
uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
|
||||
uint32x4_t lo = {co1,co2,co1,co1}, hi = {co1,co1,co1,co1};
|
||||
uint32x4_t *aa = (uint32x4_t*) a;
|
||||
void gf_bias(gf a, int amt)
|
||||
{
|
||||
uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
|
||||
uint32x4_t lo = { co1, co2, co1, co1 }, hi = {
|
||||
co1, co1, co1, co1};
|
||||
uint32x4_t *aa = (uint32x4_t *) a;
|
||||
aa[0] += lo;
|
||||
aa[1] += hi;
|
||||
aa[2] += hi;
|
||||
aa[3] += hi;
|
||||
}
|
||||
|
||||
void gf_weak_reduce (gf a) {
|
||||
void gf_weak_reduce(gf a)
|
||||
{
|
||||
|
||||
uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
|
||||
tmp = vshr_n_u32(aa[7],28);
|
||||
|
||||
for (unsigned int i=7; i>=1; i--) {
|
||||
aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28);
|
||||
uint32x2_t *aa = (uint32x2_t *) a, vmask = {
|
||||
(1ull << 28) - 1, (1ull << 28) - 1}, vm2 = {
|
||||
0, -1}, tmp = vshr_n_u32(aa[7], 28);
|
||||
|
||||
for (unsigned int i = 7; i >= 1; i--) {
|
||||
aa[i] = vsra_n_u32(aa[i] & vmask, aa[i - 1], 28);
|
||||
}
|
||||
aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2);
|
||||
aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp & vm2);
|
||||
}
|
||||
|
||||
|
@ -11,20 +11,21 @@
|
||||
*/
|
||||
|
||||
#ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
|
||||
#define __ARCH_REF64_ARCH_INTRINSICS_H__
|
||||
# define __ARCH_REF64_ARCH_INTRINSICS_H__
|
||||
|
||||
#define ARCH_WORD_BITS 64
|
||||
# define ARCH_WORD_BITS 64
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
uint64_t word_is_zero(uint64_t a) {
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
uint64_t word_is_zero(uint64_t a)
|
||||
{
|
||||
/* Yields an all-ones word when a == 0 and 0 otherwise; let's hope the compiler isn't clever enough to optimize this. */
|
||||
return (((__uint128_t)a)-1)>>64;
|
||||
return (((__uint128_t) a) - 1) >> 64;
|
||||
}
|
||||
|
||||
static __inline__ __attribute((always_inline,unused))
|
||||
__uint128_t widemul(uint64_t a, uint64_t b) {
|
||||
return ((__uint128_t)a) * b;
|
||||
static __inline__ __attribute((always_inline, unused))
|
||||
__uint128_t widemul(uint64_t a, uint64_t b)
|
||||
{
|
||||
return ((__uint128_t) a) * b;
|
||||
}
|
||||
|
||||
#endif /* ARCH_REF64_ARCH_INTRINSICS_H__ */
|
||||
|
||||
#endif /* __ARCH_REF64_ARCH_INTRINSICS_H__ */
|
||||
|
@ -11,68 +11,69 @@
|
||||
*/
|
||||
#include "f_field.h"
|
||||
|
||||
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
|
||||
{
|
||||
const uint64_t *a = as->limb, *b = bs->limb;
|
||||
uint64_t *c = cs->limb;
|
||||
|
||||
__uint128_t accum0 = 0, accum1 = 0, accum2;
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
|
||||
uint64_t aa[4], bb[4], bbb[4];
|
||||
|
||||
unsigned int i;
|
||||
for (i=0; i<4; i++) {
|
||||
aa[i] = a[i] + a[i+4];
|
||||
bb[i] = b[i] + b[i+4];
|
||||
bbb[i] = bb[i] + b[i+4];
|
||||
for (i = 0; i < 4; i++) {
|
||||
aa[i] = a[i] + a[i + 4];
|
||||
bb[i] = b[i] + b[i + 4];
|
||||
bbb[i] = bb[i] + b[i + 4];
|
||||
}
|
||||
|
||||
int I_HATE_UNROLLED_LOOPS = 0;
|
||||
|
||||
if (I_HATE_UNROLLED_LOOPS) {
|
||||
/* The compiler probably won't unroll this,
|
||||
* so it's like 80% slower.
|
||||
/*
|
||||
* The compiler probably won't unroll this, so it's like 80% slower.
|
||||
*/
|
||||
for (i=0; i<4; i++) {
|
||||
for (i = 0; i < 4; i++) {
|
||||
accum2 = 0;
|
||||
|
||||
unsigned int j;
|
||||
for (j=0; j<=i; j++) {
|
||||
accum2 += widemul(a[j], b[i-j]);
|
||||
accum1 += widemul(aa[j], bb[i-j]);
|
||||
accum0 += widemul(a[j+4], b[i-j+4]);
|
||||
for (j = 0; j <= i; j++) {
|
||||
accum2 += widemul(a[j], b[i - j]);
|
||||
accum1 += widemul(aa[j], bb[i - j]);
|
||||
accum0 += widemul(a[j + 4], b[i - j + 4]);
|
||||
}
|
||||
for (; j<4; j++) {
|
||||
accum2 += widemul(a[j], b[i-j+8]);
|
||||
accum1 += widemul(aa[j], bbb[i-j+4]);
|
||||
accum0 += widemul(a[j+4], bb[i-j+4]);
|
||||
for (; j < 4; j++) {
|
||||
accum2 += widemul(a[j], b[i - j + 8]);
|
||||
accum1 += widemul(aa[j], bbb[i - j + 4]);
|
||||
accum0 += widemul(a[j + 4], bb[i - j + 4]);
|
||||
}
|
||||
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
c[i] = ((uint64_t)(accum0)) & mask;
|
||||
c[i+4] = ((uint64_t)(accum1)) & mask;
|
||||
c[i] = ((uint64_t)(accum0)) & mask;
|
||||
c[i + 4] = ((uint64_t)(accum1)) & mask;
|
||||
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
}
|
||||
} else {
|
||||
accum2 = widemul(a[0], b[0]);
|
||||
accum2 = widemul(a[0], b[0]);
|
||||
accum1 += widemul(aa[0], bb[0]);
|
||||
accum0 += widemul(a[4], b[4]);
|
||||
accum0 += widemul(a[4], b[4]);
|
||||
|
||||
accum2 += widemul(a[1], b[7]);
|
||||
accum2 += widemul(a[1], b[7]);
|
||||
accum1 += widemul(aa[1], bbb[3]);
|
||||
accum0 += widemul(a[5], bb[3]);
|
||||
accum0 += widemul(a[5], bb[3]);
|
||||
|
||||
accum2 += widemul(a[2], b[6]);
|
||||
accum2 += widemul(a[2], b[6]);
|
||||
accum1 += widemul(aa[2], bbb[2]);
|
||||
accum0 += widemul(a[6], bb[2]);
|
||||
accum0 += widemul(a[6], bb[2]);
|
||||
|
||||
accum2 += widemul(a[3], b[5]);
|
||||
accum2 += widemul(a[3], b[5]);
|
||||
accum1 += widemul(aa[3], bbb[1]);
|
||||
accum0 += widemul(a[7], bb[1]);
|
||||
accum0 += widemul(a[7], bb[1]);
|
||||
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
@ -83,21 +84,21 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(a[0], b[1]);
|
||||
accum2 = widemul(a[0], b[1]);
|
||||
accum1 += widemul(aa[0], bb[1]);
|
||||
accum0 += widemul(a[4], b[5]);
|
||||
accum0 += widemul(a[4], b[5]);
|
||||
|
||||
accum2 += widemul(a[1], b[0]);
|
||||
accum2 += widemul(a[1], b[0]);
|
||||
accum1 += widemul(aa[1], bb[0]);
|
||||
accum0 += widemul(a[5], b[4]);
|
||||
accum0 += widemul(a[5], b[4]);
|
||||
|
||||
accum2 += widemul(a[2], b[7]);
|
||||
accum2 += widemul(a[2], b[7]);
|
||||
accum1 += widemul(aa[2], bbb[3]);
|
||||
accum0 += widemul(a[6], bb[3]);
|
||||
accum0 += widemul(a[6], bb[3]);
|
||||
|
||||
accum2 += widemul(a[3], b[6]);
|
||||
accum2 += widemul(a[3], b[6]);
|
||||
accum1 += widemul(aa[3], bbb[2]);
|
||||
accum0 += widemul(a[7], bb[2]);
|
||||
accum0 += widemul(a[7], bb[2]);
|
||||
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
@ -108,21 +109,21 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(a[0], b[2]);
|
||||
accum2 = widemul(a[0], b[2]);
|
||||
accum1 += widemul(aa[0], bb[2]);
|
||||
accum0 += widemul(a[4], b[6]);
|
||||
accum0 += widemul(a[4], b[6]);
|
||||
|
||||
accum2 += widemul(a[1], b[1]);
|
||||
accum2 += widemul(a[1], b[1]);
|
||||
accum1 += widemul(aa[1], bb[1]);
|
||||
accum0 += widemul(a[5], b[5]);
|
||||
accum0 += widemul(a[5], b[5]);
|
||||
|
||||
accum2 += widemul(a[2], b[0]);
|
||||
accum2 += widemul(a[2], b[0]);
|
||||
accum1 += widemul(aa[2], bb[0]);
|
||||
accum0 += widemul(a[6], b[4]);
|
||||
accum0 += widemul(a[6], b[4]);
|
||||
|
||||
accum2 += widemul(a[3], b[7]);
|
||||
accum2 += widemul(a[3], b[7]);
|
||||
accum1 += widemul(aa[3], bbb[3]);
|
||||
accum0 += widemul(a[7], bb[3]);
|
||||
accum0 += widemul(a[7], bb[3]);
|
||||
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
@ -133,21 +134,21 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(a[0], b[3]);
|
||||
accum2 = widemul(a[0], b[3]);
|
||||
accum1 += widemul(aa[0], bb[3]);
|
||||
accum0 += widemul(a[4], b[7]);
|
||||
accum0 += widemul(a[4], b[7]);
|
||||
|
||||
accum2 += widemul(a[1], b[2]);
|
||||
accum2 += widemul(a[1], b[2]);
|
||||
accum1 += widemul(aa[1], bb[2]);
|
||||
accum0 += widemul(a[5], b[6]);
|
||||
accum0 += widemul(a[5], b[6]);
|
||||
|
||||
accum2 += widemul(a[2], b[1]);
|
||||
accum2 += widemul(a[2], b[1]);
|
||||
accum1 += widemul(aa[2], bb[1]);
|
||||
accum0 += widemul(a[6], b[5]);
|
||||
accum0 += widemul(a[6], b[5]);
|
||||
|
||||
accum2 += widemul(a[3], b[0]);
|
||||
accum2 += widemul(a[3], b[0]);
|
||||
accum1 += widemul(aa[3], bb[0]);
|
||||
accum0 += widemul(a[7], b[4]);
|
||||
accum0 += widemul(a[7], b[4]);
|
||||
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
@ -157,7 +158,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
} /* !I_HATE_UNROLLED_LOOPS */
|
||||
} /* !I_HATE_UNROLLED_LOOPS */
|
||||
|
||||
accum0 += accum1;
|
||||
accum0 += c[4];
|
||||
@ -172,21 +173,24 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
c[1] += ((uint64_t)(accum1));
|
||||
}
|
||||
|
||||
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
|
||||
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
|
||||
{
|
||||
const uint64_t *a = as->limb;
|
||||
uint64_t *c = cs->limb;
|
||||
|
||||
__uint128_t accum0 = 0, accum4 = 0;
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
|
||||
int i;
|
||||
for (i=0; i<4; i++) {
|
||||
for (i = 0; i < 4; i++) {
|
||||
accum0 += widemul(b, a[i]);
|
||||
accum4 += widemul(b, a[i+4]);
|
||||
c[i] = accum0 & mask; accum0 >>= 56;
|
||||
c[i+4] = accum4 & mask; accum4 >>= 56;
|
||||
accum4 += widemul(b, a[i + 4]);
|
||||
c[i] = accum0 & mask;
|
||||
accum0 >>= 56;
|
||||
c[i + 4] = accum4 & mask;
|
||||
accum4 >>= 56;
|
||||
}
|
||||
|
||||
|
||||
accum0 += accum4 + c[4];
|
||||
c[4] = accum0 & mask;
|
||||
c[5] += accum0 >> 56;
|
||||
@ -196,24 +200,25 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
|
||||
c[1] += accum4 >> 56;
|
||||
}
|
||||
|
||||
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
void gf_sqr(gf_s * __restrict__ cs, const gf as)
|
||||
{
|
||||
const uint64_t *a = as->limb;
|
||||
uint64_t *c = cs->limb;
|
||||
|
||||
__uint128_t accum0 = 0, accum1 = 0, accum2;
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
|
||||
uint64_t aa[4];
|
||||
|
||||
/* For some reason clang doesn't vectorize this without prompting? */
|
||||
unsigned int i;
|
||||
for (i=0; i<4; i++) {
|
||||
aa[i] = a[i] + a[i+4];
|
||||
for (i = 0; i < 4; i++) {
|
||||
aa[i] = a[i] + a[i + 4];
|
||||
}
|
||||
|
||||
accum2 = widemul(a[0],a[3]);
|
||||
accum0 = widemul(aa[0],aa[3]);
|
||||
accum1 = widemul(a[4],a[7]);
|
||||
accum2 = widemul(a[0], a[3]);
|
||||
accum0 = widemul(aa[0], aa[3]);
|
||||
accum1 = widemul(a[4], a[7]);
|
||||
|
||||
accum2 += widemul(a[1], a[2]);
|
||||
accum0 += widemul(aa[1], aa[2]);
|
||||
@ -222,21 +227,21 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
accum0 -= accum2;
|
||||
accum1 += accum2;
|
||||
|
||||
c[3] = ((uint64_t)(accum1))<<1 & mask;
|
||||
c[7] = ((uint64_t)(accum0))<<1 & mask;
|
||||
c[3] = ((uint64_t)(accum1)) << 1 & mask;
|
||||
c[7] = ((uint64_t)(accum0)) << 1 & mask;
|
||||
|
||||
accum0 >>= 55;
|
||||
accum1 >>= 55;
|
||||
|
||||
accum0 += widemul(2*aa[1],aa[3]);
|
||||
accum1 += widemul(2*a[5], a[7]);
|
||||
accum0 += widemul(2 * aa[1], aa[3]);
|
||||
accum1 += widemul(2 * a[5], a[7]);
|
||||
accum0 += widemul(aa[2], aa[2]);
|
||||
accum1 += accum0;
|
||||
|
||||
accum0 -= widemul(2*a[1], a[3]);
|
||||
accum0 -= widemul(2 * a[1], a[3]);
|
||||
accum1 += widemul(a[6], a[6]);
|
||||
|
||||
accum2 = widemul(a[0],a[0]);
|
||||
|
||||
accum2 = widemul(a[0], a[0]);
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
@ -250,16 +255,16 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(2*aa[2],aa[3]);
|
||||
accum0 -= widemul(2*a[2], a[3]);
|
||||
accum1 += widemul(2*a[6], a[7]);
|
||||
accum2 = widemul(2 * aa[2], aa[3]);
|
||||
accum0 -= widemul(2 * a[2], a[3]);
|
||||
accum1 += widemul(2 * a[6], a[7]);
|
||||
|
||||
accum1 += accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
accum2 = widemul(2*a[0],a[1]);
|
||||
accum1 += widemul(2*aa[0], aa[1]);
|
||||
accum0 += widemul(2*a[4], a[5]);
|
||||
accum2 = widemul(2 * a[0], a[1]);
|
||||
accum1 += widemul(2 * aa[0], aa[1]);
|
||||
accum0 += widemul(2 * a[4], a[5]);
|
||||
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
@ -270,16 +275,16 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(aa[3],aa[3]);
|
||||
accum2 = widemul(aa[3], aa[3]);
|
||||
accum0 -= widemul(a[3], a[3]);
|
||||
accum1 += widemul(a[7], a[7]);
|
||||
|
||||
accum1 += accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
accum2 = widemul(2*a[0],a[2]);
|
||||
accum1 += widemul(2*aa[0], aa[2]);
|
||||
accum0 += widemul(2*a[4], a[6]);
|
||||
accum2 = widemul(2 * a[0], a[2]);
|
||||
accum1 += widemul(2 * aa[0], aa[2]);
|
||||
accum0 += widemul(2 * a[4], a[6]);
|
||||
|
||||
accum2 += widemul(a[1], a[1]);
|
||||
accum1 += widemul(aa[1], aa[1]);
|
||||
@ -306,4 +311,3 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
|
||||
c[0] += ((uint64_t)(accum1));
|
||||
}
|
||||
|
||||
|
@ -10,37 +10,41 @@
|
||||
* Originally written by Mike Hamburg
|
||||
*/
|
||||
|
||||
#define GF_HEADROOM 9999 /* Everything is reduced anyway */
|
||||
#define GF_HEADROOM 9999 /* Everything is reduced anyway */
|
||||
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
|
||||
|
||||
|
||||
#define LIMB_PLACE_VALUE(i) 56
|
||||
|
||||
void gf_add_RAW (gf out, const gf a, const gf b) {
|
||||
for (unsigned int i=0; i<8; i++) {
|
||||
void gf_add_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
for (unsigned int i = 0; i < 8; i++) {
|
||||
out->limb[i] = a->limb[i] + b->limb[i];
|
||||
}
|
||||
gf_weak_reduce(out);
|
||||
}
|
||||
|
||||
void gf_sub_RAW (gf out, const gf a, const gf b) {
|
||||
uint64_t co1 = ((1ull<<56)-1)*2, co2 = co1-2;
|
||||
for (unsigned int i=0; i<8; i++) {
|
||||
out->limb[i] = a->limb[i] - b->limb[i] + ((i==4) ? co2 : co1);
|
||||
void gf_sub_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
uint64_t co1 = ((1ull << 56) - 1) * 2, co2 = co1 - 2;
|
||||
for (unsigned int i = 0; i < 8; i++) {
|
||||
out->limb[i] = a->limb[i] - b->limb[i] + ((i == 4) ? co2 : co1);
|
||||
}
|
||||
gf_weak_reduce(out);
|
||||
}
|
||||
|
||||
void gf_bias (gf a, int amt) {
|
||||
(void) a;
|
||||
(void) amt;
|
||||
void gf_bias(gf a, int amt)
|
||||
{
|
||||
(void)a;
|
||||
(void)amt;
|
||||
}
|
||||
|
||||
void gf_weak_reduce (gf a) {
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
void gf_weak_reduce(gf a)
|
||||
{
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
uint64_t tmp = a->limb[7] >> 56;
|
||||
a->limb[4] += tmp;
|
||||
for (unsigned int i=7; i>0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
|
||||
for (unsigned int i = 7; i > 0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
|
||||
}
|
||||
a->limb[0] = (a->limb[0] & mask) + tmp;
|
||||
}
|
||||
|
@ -10,303 +10,292 @@
|
||||
* Originally written by Mike Hamburg
|
||||
*/
|
||||
#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
|
||||
#define __ARCH_X86_64_ARCH_INTRINSICS_H__
|
||||
# define __ARCH_X86_64_ARCH_INTRINSICS_H__
|
||||
|
||||
#define ARCH_WORD_BITS 64
|
||||
# define ARCH_WORD_BITS 64
|
||||
|
||||
#include <openssl/e_os2.h>
|
||||
# include <openssl/e_os2.h>
|
||||
|
||||
/* FUTURE: autogenerate */
|
||||
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t c,d;
|
||||
#ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax;"
|
||||
"mulq %[b];"
|
||||
: [c]"=&a"(c), [d]"=d"(d)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx;"
|
||||
"mulx %[b], %[c], %[d];"
|
||||
: [c]"=r"(c), [d]"=r"(d)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx");
|
||||
#endif
|
||||
return (((__uint128_t)(d))<<64) | c;
|
||||
static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b)
|
||||
{
|
||||
uint64_t c, d;
|
||||
# ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax;" "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"cc");
|
||||
# else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx;" "mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx");
|
||||
# endif
|
||||
return (((__uint128_t) (d)) << 64) | c;
|
||||
}
|
||||
|
||||
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
|
||||
uint64_t c,d;
|
||||
#ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax;"
|
||||
"mulq %[b];"
|
||||
: [c]"=&a"(c), [d]"=d"(d)
|
||||
: [b]"m"(*b), [a]"r"(a)
|
||||
: "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d];"
|
||||
: [c]"=r"(c), [d]"=r"(d)
|
||||
: [b]"m"(*b), [a]"d"(a));
|
||||
#endif
|
||||
return (((__uint128_t)(d))<<64) | c;
|
||||
static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b)
|
||||
{
|
||||
uint64_t c, d;
|
||||
# ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax;" "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
|
||||
:[b] "m"(*b),[a] "r"(a)
|
||||
:"cc");
|
||||
# else
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
|
||||
:[b] "m"(*b),[a] "d"(a));
|
||||
# endif
|
||||
return (((__uint128_t) (d)) << 64) | c;
|
||||
}
|
||||
|
||||
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
|
||||
uint64_t c,d;
|
||||
#ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("mulq %[b];"
|
||||
: [c]"=a"(c), [d]"=d"(d)
|
||||
: [b]"r"(b), "a"(a)
|
||||
: "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d];"
|
||||
: [c]"=r"(c), [d]"=r"(d)
|
||||
: [b]"r"(b), [a]"d"(a));
|
||||
#endif
|
||||
return (((__uint128_t)(d))<<64) | c;
|
||||
static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b)
|
||||
{
|
||||
uint64_t c, d;
|
||||
# ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("mulq %[b];":[c] "=a"(c),[d] "=d"(d)
|
||||
:[b] "r"(b), "a"(a)
|
||||
:"cc");
|
||||
# else
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
|
||||
:[b] "r"(b),[a] "d"(a));
|
||||
# endif
|
||||
return (((__uint128_t) (d)) << 64) | c;
|
||||
}
|
||||
|
||||
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t c,d;
|
||||
#ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"addq %%rax, %%rax; "
|
||||
"mulq %[b];"
|
||||
: [c]"=&a"(c), [d]"=d"(d)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx;"
|
||||
"leaq (,%%rdx,2), %%rdx;"
|
||||
"mulx %[b], %[c], %[d];"
|
||||
: [c]"=r"(c), [d]"=r"(d)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx");
|
||||
#endif
|
||||
return (((__uint128_t)(d))<<64) | c;
|
||||
static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b)
|
||||
{
|
||||
uint64_t c, d;
|
||||
# ifndef __BMI2__
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"addq %%rax, %%rax; " "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"cc");
|
||||
# else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx;"
|
||||
"leaq (,%%rdx,2), %%rdx;" "mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx");
|
||||
# endif
|
||||
return (((__uint128_t) (d)) << 64) | c;
|
||||
}
|
||||
|
||||
static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t lo = *acc, hi = *acc>>64;
|
||||
|
||||
#ifdef __BMI2__
|
||||
uint64_t c,d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; "
|
||||
: [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx", "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; "
|
||||
"adcq %%rdx, %[hi]; "
|
||||
: [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rax", "rdx", "cc");
|
||||
#endif
|
||||
|
||||
*acc = (((__uint128_t)(hi))<<64) | lo;
|
||||
static __inline__ void mac(__uint128_t * acc, const uint64_t *a,
|
||||
const uint64_t *b)
|
||||
{
|
||||
uint64_t lo = *acc, hi = *acc >> 64;
|
||||
|
||||
# ifdef __BMI2__
|
||||
uint64_t c, d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; ":[c] "=&r"(c),[d] "=&r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx", "cc");
|
||||
# else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rax", "rdx", "cc");
|
||||
# endif
|
||||
|
||||
*acc = (((__uint128_t) (hi)) << 64) | lo;
|
||||
}
|
||||
|
||||
static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t lo = *acc, hi = *acc>>64;
|
||||
uint64_t lo2 = *acc2, hi2 = *acc2>>64;
|
||||
|
||||
#ifdef __BMI2__
|
||||
uint64_t c,d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; "
|
||||
"addq %[c], %[lo2]; "
|
||||
"adcq %[d], %[hi2]; "
|
||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx", "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; "
|
||||
"adcq %%rdx, %[hi]; "
|
||||
"addq %%rax, %[lo2]; "
|
||||
"adcq %%rdx, %[hi2]; "
|
||||
: [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rax", "rdx", "cc");
|
||||
#endif
|
||||
|
||||
*acc = (((__uint128_t)(hi))<<64) | lo;
|
||||
*acc2 = (((__uint128_t)(hi2))<<64) | lo2;
|
||||
static __inline__ void macac(__uint128_t * acc, __uint128_t * acc2,
|
||||
const uint64_t *a, const uint64_t *b)
|
||||
{
|
||||
uint64_t lo = *acc, hi = *acc >> 64;
|
||||
uint64_t lo2 = *acc2, hi2 = *acc2 >> 64;
|
||||
|
||||
# ifdef __BMI2__
|
||||
uint64_t c, d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; "
|
||||
"addq %[c], %[lo2]; "
|
||||
"adcq %[d], %[hi2]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi),
|
||||
[lo2] "+r"(lo2),[hi2] "+r"(hi2)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx", "cc");
|
||||
# else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; "
|
||||
"adcq %%rdx, %[hi]; "
|
||||
"addq %%rax, %[lo2]; "
|
||||
"adcq %%rdx, %[hi2]; ":[lo] "+r"(lo),[hi] "+r"(hi),[lo2] "+r"(lo2),
|
||||
[hi2] "+r"(hi2)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rax", "rdx", "cc");
|
||||
# endif
|
||||
|
||||
*acc = (((__uint128_t) (hi)) << 64) | lo;
|
||||
*acc2 = (((__uint128_t) (hi2)) << 64) | lo2;
|
||||
}
|
||||
|
||||
static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
|
||||
uint64_t lo = *acc, hi = *acc>>64;
|
||||
|
||||
#ifdef __BMI2__
|
||||
uint64_t c,d;
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; "
|
||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"d"(a)
|
||||
: "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; "
|
||||
"adcq %%rdx, %[hi]; "
|
||||
: [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"r"(a)
|
||||
: "rax", "rdx", "cc");
|
||||
#endif
|
||||
|
||||
*acc = (((__uint128_t)(hi))<<64) | lo;
|
||||
/*
 * mac_rm: multiply-accumulate, register times memory.
 *
 * Adds the 128-bit product a * (*b) to *acc.  `a` is passed by value and
 * `b` is read through a memory operand.  The accumulator is handled as two
 * 64-bit halves (add + adc); a carry out of bit 127 is dropped.
 */
static __inline__ void mac_rm(__uint128_t * acc, uint64_t a, const uint64_t *b)
|
||||
{
|
||||
uint64_t lo = *acc, hi = *acc >> 64;
|
||||
|
||||
# ifdef __BMI2__
|
||||
/* The "d" constraint places `a` in rdx, which is mulx's implicit factor; no explicit move is needed. */
uint64_t c, d;
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "d"(a)
|
||||
:"cc");
|
||||
# else
|
||||
/* Fallback: copy `a` into rax, then mulq leaves the product in rdx:rax. */
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "r"(a)
|
||||
:"rax", "rdx", "cc");
|
||||
# endif
|
||||
|
||||
*acc = (((__uint128_t) (hi)) << 64) | lo;
|
||||
}
|
||||
|
||||
static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
|
||||
uint64_t lo = *acc, hi = *acc>>64;
|
||||
|
||||
#ifdef __BMI2__
|
||||
uint64_t c,d;
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; "
|
||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"r"(b), [a]"d"(a)
|
||||
: "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("mulq %[b]; "
|
||||
"addq %%rax, %[lo]; "
|
||||
"adcq %%rdx, %[hi]; "
|
||||
: [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a)
|
||||
: [b]"r"(b)
|
||||
: "rdx", "cc");
|
||||
#endif
|
||||
|
||||
*acc = (((__uint128_t)(hi))<<64) | lo;
|
||||
/*
 * mac_rr: multiply-accumulate, register times register.
 *
 * Adds the 128-bit product a * b to *acc; both factors are passed by
 * value.  The accumulator is handled as two 64-bit halves (add + adc); a
 * carry out of bit 127 is dropped.
 */
static __inline__ void mac_rr(__uint128_t * acc, uint64_t a, const uint64_t b)
|
||||
{
|
||||
uint64_t lo = *acc, hi = *acc >> 64;
|
||||
|
||||
# ifdef __BMI2__
|
||||
/* `a` is bound to rdx ("d"), the implicit factor of mulx. */
uint64_t c, d;
|
||||
__asm__ volatile
|
||||
("mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "r"(b),[a] "d"(a)
|
||||
:"cc");
|
||||
# else
|
||||
/* `a` is a read-write operand tied to rax ("+a") because mulq both reads rax and overwrites it with the low product half. */
__asm__ volatile
|
||||
("mulq %[b]; "
|
||||
"addq %%rax, %[lo]; "
|
||||
"adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi), "+a"(a)
|
||||
:[b] "r"(b)
|
||||
:"rdx", "cc");
|
||||
# endif
|
||||
|
||||
*acc = (((__uint128_t) (hi)) << 64) | lo;
|
||||
}
|
||||
|
||||
static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t lo = *acc, hi = *acc>>64;
|
||||
|
||||
#ifdef __BMI2__
|
||||
uint64_t c,d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"addq %%rdx, %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; "
|
||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx", "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"addq %%rax, %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; "
|
||||
"adcq %%rdx, %[hi]; "
|
||||
: [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rax", "rdx", "cc");
|
||||
#endif
|
||||
|
||||
*acc = (((__uint128_t)(hi))<<64) | lo;
|
||||
/*
 * mac2: doubled multiply-accumulate.
 *
 * Adds 2 * (*a) * (*b) to *acc.  The doubling is done by adding the loaded
 * copy of *a to itself in a 64-bit register before the multiply, so bit 63
 * of *a is shifted out.
 * NOTE(review): callers presumably keep *a below 2^63 (limbs look like
 * 56-bit values elsewhere in this code) -- confirm.
 */
static __inline__ void mac2(__uint128_t * acc, const uint64_t *a,
|
||||
const uint64_t *b)
|
||||
{
|
||||
uint64_t lo = *acc, hi = *acc >> 64;
|
||||
|
||||
# ifdef __BMI2__
|
||||
/* BMI2 build: double in rdx, then mulx writes the product to c (low) and d (high). */
uint64_t c, d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"addq %%rdx, %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"addq %[c], %[lo]; "
|
||||
"adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx", "cc");
|
||||
# else
|
||||
/* Fallback: double in rax, then mulq leaves the product in rdx:rax. */
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"addq %%rax, %%rax; "
|
||||
"mulq %[b]; "
|
||||
"addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rax", "rdx", "cc");
|
||||
# endif
|
||||
|
||||
*acc = (((__uint128_t) (hi)) << 64) | lo;
|
||||
}
|
||||
|
||||
static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t lo = *acc, hi = *acc>>64;
|
||||
#ifdef __BMI2__
|
||||
uint64_t c,d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"subq %[c], %[lo]; "
|
||||
"sbbq %[d], %[hi]; "
|
||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx", "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"subq %%rax, %[lo]; "
|
||||
"sbbq %%rdx, %[hi]; "
|
||||
: [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rax", "rdx", "cc");
|
||||
#endif
|
||||
*acc = (((__uint128_t)(hi))<<64) | lo;
|
||||
/*
 * msb: multiply-subtract.
 *
 * Subtracts the 128-bit product (*a) * (*b) from *acc.  The accumulator is
 * handled as two 64-bit halves (sub + sbb); a borrow out of bit 127 is
 * dropped, i.e. the result wraps modulo 2^128.
 */
static __inline__ void msb(__uint128_t * acc, const uint64_t *a,
|
||||
const uint64_t *b)
|
||||
{
|
||||
uint64_t lo = *acc, hi = *acc >> 64;
|
||||
# ifdef __BMI2__
|
||||
/* BMI2 build: product lands in c (low) and d (high). */
uint64_t c, d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"subq %[c], %[lo]; "
|
||||
"sbbq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx", "cc");
|
||||
# else
|
||||
/* Fallback: product lands in rdx:rax. */
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"mulq %[b]; "
|
||||
"subq %%rax, %[lo]; " "sbbq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rax", "rdx", "cc");
|
||||
# endif
|
||||
*acc = (((__uint128_t) (hi)) << 64) | lo;
|
||||
}
|
||||
|
||||
static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t lo = *acc, hi = *acc>>64;
|
||||
#ifdef __BMI2__
|
||||
uint64_t c,d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"addq %%rdx, %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"subq %[c], %[lo]; "
|
||||
"sbbq %[d], %[hi]; "
|
||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx", "cc");
|
||||
#else
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"addq %%rax, %%rax; "
|
||||
"mulq %[b]; "
|
||||
"subq %%rax, %[lo]; "
|
||||
"sbbq %%rdx, %[hi]; "
|
||||
: [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rax", "rdx", "cc");
|
||||
#endif
|
||||
*acc = (((__uint128_t)(hi))<<64) | lo;
|
||||
|
||||
/*
 * msb2: doubled multiply-subtract.
 *
 * Subtracts 2 * (*a) * (*b) from *acc.  As in mac2, the doubling is a
 * 64-bit self-add of the loaded *a, so bit 63 of *a is shifted out.  The
 * subtraction is a sub + sbb pair on the 64-bit halves and wraps modulo
 * 2^128.
 * NOTE(review): callers presumably keep *a below 2^63 -- confirm.
 */
static __inline__ void msb2(__uint128_t * acc, const uint64_t *a,
|
||||
const uint64_t *b)
|
||||
{
|
||||
uint64_t lo = *acc, hi = *acc >> 64;
|
||||
# ifdef __BMI2__
|
||||
/* BMI2 build: double in rdx, product lands in c (low) and d (high). */
uint64_t c, d;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"addq %%rdx, %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"subq %[c], %[lo]; "
|
||||
"sbbq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx", "cc");
|
||||
# else
|
||||
/* Fallback: double in rax, product lands in rdx:rax. */
__asm__ volatile
|
||||
("movq %[a], %%rax; "
|
||||
"addq %%rax, %%rax; "
|
||||
"mulq %[b]; "
|
||||
"subq %%rax, %[lo]; " "sbbq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rax", "rdx", "cc");
|
||||
# endif
|
||||
*acc = (((__uint128_t) (hi)) << 64) | lo;
|
||||
|
||||
}
|
||||
|
||||
static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
|
||||
uint64_t c,d, lo = *acc, hi = *acc>>64;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"subq %[lo], %[c]; "
|
||||
"sbbq %[hi], %[d]; "
|
||||
: [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
|
||||
: [b]"m"(*b), [a]"m"(*a)
|
||||
: "rdx", "cc");
|
||||
*acc = (((__uint128_t)(d))<<64) | c;
|
||||
/*
 * mrs: multiply, reverse-subtract.
 *
 * Replaces *acc with (*a) * (*b) - *acc.  The product is formed in c (low)
 * and d (high); the old accumulator halves are then subtracted from it
 * with sub + sbb, and d:c is written back.  The result wraps modulo 2^128.
 *
 * lo and hi are declared read-write ("+r") although the asm only reads
 * them.
 * NOTE(review): unlike the sibling helpers, this function uses mulx with
 * no `# ifdef __BMI2__` guard and no mulq fallback -- confirm it is only
 * reached on BMI2-capable targets.
 */
static __inline__ void mrs(__uint128_t * acc, const uint64_t *a,
|
||||
const uint64_t *b)
|
||||
{
|
||||
uint64_t c, d, lo = *acc, hi = *acc >> 64;
|
||||
__asm__ volatile
|
||||
("movq %[a], %%rdx; "
|
||||
"mulx %[b], %[c], %[d]; "
|
||||
"subq %[lo], %[c]; "
|
||||
"sbbq %[hi], %[d]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
|
||||
:[b] "m"(*b),[a] "m"(*a)
|
||||
:"rdx", "cc");
|
||||
*acc = (((__uint128_t) (d)) << 64) | c;
|
||||
}
|
||||
|
||||
static __inline__ uint64_t word_is_zero(uint64_t x) {
|
||||
__asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
|
||||
return ~x;
|
||||
/*
 * word_is_zero: branch-free zero test.
 *
 * Returns all-ones (~0) when x == 0 and 0 otherwise.
 */
static __inline__ uint64_t word_is_zero(uint64_t x)
|
||||
{
|
||||
/* neg sets the carry flag iff x was nonzero; sbb x,x then yields 0 - CF, i.e. all-ones for nonzero input and 0 for zero. */
__asm__ volatile ("neg %0; sbb %0, %0;":"+r" (x));
|
||||
/* Invert so that the "is zero" case is the all-ones mask. */
return ~x;
|
||||
}
|
||||
|
||||
static inline uint64_t shrld(__uint128_t x, int n) {
|
||||
return x>>n;
|
||||
/*
 * shrld: shift a 128-bit value right by n bits and return the low 64 bits
 * of the result (the conversion to the uint64_t return type discards the
 * upper half).  As with any C shift, n must be in [0, 127].
 */
static inline uint64_t shrld(__uint128_t x, int n)
|
||||
{
|
||||
return x >> n;
|
||||
}
|
||||
|
||||
#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
|
||||
#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
|
||||
|
@ -12,32 +12,34 @@
|
||||
|
||||
#include "f_field.h"
|
||||
|
||||
void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
|
||||
{
|
||||
const uint64_t *a = as->limb, *b = bs->limb;
|
||||
uint64_t *c = cs->limb;
|
||||
|
||||
__uint128_t accum0 = 0, accum1 = 0, accum2;
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
|
||||
uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;
|
||||
|
||||
/* For some reason clang doesn't vectorize this without prompting? */
|
||||
unsigned int i;
|
||||
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
|
||||
((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i];
|
||||
((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];
|
||||
for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t *) aa)[i] =
|
||||
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
|
||||
((uint64xn_t *) bb)[i] =
|
||||
((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
|
||||
((uint64xn_t *) bbb)[i] =
|
||||
((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
|
||||
}
|
||||
/*
|
||||
for (int i=0; i<4; i++) {
|
||||
aa[i] = a[i] + a[i+4];
|
||||
bb[i] = b[i] + b[i+4];
|
||||
}
|
||||
*/
|
||||
* for (int i=0; i<4; i++) { aa[i] = a[i] + a[i+4]; bb[i] = b[i] + b[i+4];
|
||||
* }
|
||||
*/
|
||||
|
||||
accum2 = widemul(&a[0],&b[3]);
|
||||
accum0 = widemul(&aa[0],&bb[3]);
|
||||
accum1 = widemul(&a[4],&b[7]);
|
||||
accum2 = widemul(&a[0], &b[3]);
|
||||
accum0 = widemul(&aa[0], &bb[3]);
|
||||
accum1 = widemul(&a[4], &b[7]);
|
||||
|
||||
mac(&accum2, &a[1], &b[2]);
|
||||
mac(&accum0, &aa[1], &bb[2]);
|
||||
@ -59,18 +61,18 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
mac(&accum0, &aa[1],&bb[3]);
|
||||
|
||||
mac(&accum0, &aa[1], &bb[3]);
|
||||
mac(&accum1, &a[5], &b[7]);
|
||||
mac(&accum0, &aa[2], &bb[2]);
|
||||
mac(&accum1, &a[6], &b[6]);
|
||||
mac(&accum0, &aa[3], &bb[1]);
|
||||
accum1 += accum0;
|
||||
|
||||
accum2 = widemul(&a[0],&b[0]);
|
||||
accum2 = widemul(&a[0], &b[0]);
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
|
||||
msb(&accum0, &a[1], &b[3]);
|
||||
msb(&accum0, &a[2], &b[2]);
|
||||
mac(&accum1, &a[7], &b[5]);
|
||||
@ -84,7 +86,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(&a[2],&b[7]);
|
||||
accum2 = widemul(&a[2], &b[7]);
|
||||
mac(&accum0, &a[6], &bb[3]);
|
||||
mac(&accum1, &aa[2], &bbb[3]);
|
||||
|
||||
@ -92,7 +94,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
mac(&accum0, &a[7], &bb[2]);
|
||||
mac(&accum1, &aa[3], &bbb[2]);
|
||||
|
||||
mac(&accum2, &a[0],&b[1]);
|
||||
mac(&accum2, &a[0], &b[1]);
|
||||
mac(&accum1, &aa[0], &bb[1]);
|
||||
mac(&accum0, &a[4], &b[5]);
|
||||
|
||||
@ -109,11 +111,11 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(&a[3],&b[7]);
|
||||
accum2 = widemul(&a[3], &b[7]);
|
||||
mac(&accum0, &a[7], &bb[3]);
|
||||
mac(&accum1, &aa[3], &bbb[3]);
|
||||
|
||||
mac(&accum2, &a[0],&b[2]);
|
||||
mac(&accum2, &a[0], &b[2]);
|
||||
mac(&accum1, &aa[0], &bb[2]);
|
||||
mac(&accum0, &a[4], &b[6]);
|
||||
|
||||
@ -147,37 +149,46 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
|
||||
c[0] += ((uint64_t)(accum1));
|
||||
}
|
||||
|
||||
void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
|
||||
void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
|
||||
{
|
||||
const uint64_t *a = as->limb;
|
||||
uint64_t *c = cs->limb;
|
||||
|
||||
__uint128_t accum0, accum4;
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
|
||||
accum0 = widemul_rm(b, &a[0]);
|
||||
accum4 = widemul_rm(b, &a[4]);
|
||||
|
||||
c[0] = accum0 & mask; accum0 >>= 56;
|
||||
c[4] = accum4 & mask; accum4 >>= 56;
|
||||
c[0] = accum0 & mask;
|
||||
accum0 >>= 56;
|
||||
c[4] = accum4 & mask;
|
||||
accum4 >>= 56;
|
||||
|
||||
mac_rm(&accum0, b, &a[1]);
|
||||
mac_rm(&accum4, b, &a[5]);
|
||||
|
||||
c[1] = accum0 & mask; accum0 >>= 56;
|
||||
c[5] = accum4 & mask; accum4 >>= 56;
|
||||
c[1] = accum0 & mask;
|
||||
accum0 >>= 56;
|
||||
c[5] = accum4 & mask;
|
||||
accum4 >>= 56;
|
||||
|
||||
mac_rm(&accum0, b, &a[2]);
|
||||
mac_rm(&accum4, b, &a[6]);
|
||||
|
||||
c[2] = accum0 & mask; accum0 >>= 56;
|
||||
c[6] = accum4 & mask; accum4 >>= 56;
|
||||
c[2] = accum0 & mask;
|
||||
accum0 >>= 56;
|
||||
c[6] = accum4 & mask;
|
||||
accum4 >>= 56;
|
||||
|
||||
mac_rm(&accum0, b, &a[3]);
|
||||
mac_rm(&accum4, b, &a[7]);
|
||||
|
||||
c[3] = accum0 & mask; accum0 >>= 56;
|
||||
c[7] = accum4 & mask; accum4 >>= 56;
|
||||
|
||||
c[3] = accum0 & mask;
|
||||
accum0 >>= 56;
|
||||
c[7] = accum4 & mask;
|
||||
accum4 >>= 56;
|
||||
|
||||
accum0 += accum4 + c[4];
|
||||
c[4] = accum0 & mask;
|
||||
c[5] += accum0 >> 56;
|
||||
@ -187,24 +198,26 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
|
||||
c[1] += accum4 >> 56;
|
||||
}
|
||||
|
||||
void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
void gf_sqr(gf_s * __restrict__ cs, const gf as)
|
||||
{
|
||||
const uint64_t *a = as->limb;
|
||||
uint64_t *c = cs->limb;
|
||||
|
||||
__uint128_t accum0 = 0, accum1 = 0, accum2;
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
|
||||
uint64_t aa[4] VECTOR_ALIGNED;
|
||||
|
||||
/* For some reason clang doesn't vectorize this without prompting? */
|
||||
unsigned int i;
|
||||
for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
|
||||
for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t *) aa)[i] =
|
||||
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
|
||||
}
|
||||
|
||||
accum2 = widemul(&a[0],&a[3]);
|
||||
accum0 = widemul(&aa[0],&aa[3]);
|
||||
accum1 = widemul(&a[4],&a[7]);
|
||||
accum2 = widemul(&a[0], &a[3]);
|
||||
accum0 = widemul(&aa[0], &aa[3]);
|
||||
accum1 = widemul(&a[4], &a[7]);
|
||||
|
||||
mac(&accum2, &a[1], &a[2]);
|
||||
mac(&accum0, &aa[1], &aa[2]);
|
||||
@ -213,21 +226,21 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
accum0 -= accum2;
|
||||
accum1 += accum2;
|
||||
|
||||
c[3] = ((uint64_t)(accum1))<<1 & mask;
|
||||
c[7] = ((uint64_t)(accum0))<<1 & mask;
|
||||
c[3] = ((uint64_t)(accum1)) << 1 & mask;
|
||||
c[7] = ((uint64_t)(accum0)) << 1 & mask;
|
||||
|
||||
accum0 >>= 55;
|
||||
accum1 >>= 55;
|
||||
|
||||
mac2(&accum0, &aa[1],&aa[3]);
|
||||
mac2(&accum0, &aa[1], &aa[3]);
|
||||
mac2(&accum1, &a[5], &a[7]);
|
||||
mac(&accum0, &aa[2], &aa[2]);
|
||||
accum1 += accum0;
|
||||
|
||||
msb2(&accum0, &a[1], &a[3]);
|
||||
mac(&accum1, &a[6], &a[6]);
|
||||
|
||||
accum2 = widemul(&a[0],&a[0]);
|
||||
|
||||
accum2 = widemul(&a[0], &a[0]);
|
||||
accum1 -= accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
@ -241,14 +254,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul2(&aa[2],&aa[3]);
|
||||
accum2 = widemul2(&aa[2], &aa[3]);
|
||||
msb2(&accum0, &a[2], &a[3]);
|
||||
mac2(&accum1, &a[6], &a[7]);
|
||||
|
||||
accum1 += accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
accum2 = widemul2(&a[0],&a[1]);
|
||||
accum2 = widemul2(&a[0], &a[1]);
|
||||
mac2(&accum1, &aa[0], &aa[1]);
|
||||
mac2(&accum0, &a[4], &a[5]);
|
||||
|
||||
@ -261,14 +274,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
|
||||
accum0 >>= 56;
|
||||
accum1 >>= 56;
|
||||
|
||||
accum2 = widemul(&aa[3],&aa[3]);
|
||||
accum2 = widemul(&aa[3], &aa[3]);
|
||||
msb(&accum0, &a[3], &a[3]);
|
||||
mac(&accum1, &a[7], &a[7]);
|
||||
|
||||
accum1 += accum2;
|
||||
accum0 += accum2;
|
||||
|
||||
accum2 = widemul2(&a[0],&a[2]);
|
||||
accum2 = widemul2(&a[0], &a[2]);
|
||||
mac2(&accum1, &aa[0], &aa[2]);
|
||||
mac2(&accum0, &a[4], &a[6]);
|
||||
|
||||
|
@ -14,60 +14,63 @@
|
||||
#define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
|
||||
#define LIMB_PLACE_VALUE(i) 56
|
||||
|
||||
/*
 * gf_add_RAW: limb-wise addition, out = a + b, with no carry propagation
 * and no reduction.  The storage behind `out` is walked in
 * uint64xn_t-sized chunks (presumably a vector of uint64_t lanes; the type
 * is defined elsewhere -- confirm).  The commented-out loop inside the
 * body is the scalar per-limb equivalent.
 */
void gf_add_RAW (gf out, const gf a, const gf b) {
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
|
||||
void gf_add_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t *) out)[i] =
|
||||
((const uint64xn_t *)a)[i] + ((const uint64xn_t *)b)[i];
|
||||
}
|
||||
/*
|
||||
unsigned int i;
|
||||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
out->limb[i] = a->limb[i] + b->limb[i];
|
||||
}
|
||||
*/
|
||||
* unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
* out->limb[i] = a->limb[i] + b->limb[i]; }
|
||||
*/
|
||||
}
|
||||
|
||||
/*
 * gf_sub_RAW: limb-wise subtraction, out = a - b, with no borrow
 * propagation and no reduction; each unsigned lane wraps on its own.  The
 * storage behind `out` is walked in uint64xn_t-sized chunks (presumably a
 * vector of uint64_t lanes; the type is defined elsewhere -- confirm).
 * The commented-out loop inside the body is the scalar per-limb
 * equivalent.
 */
void gf_sub_RAW (gf out, const gf a, const gf b) {
|
||||
for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
|
||||
void gf_sub_RAW(gf out, const gf a, const gf b)
|
||||
{
|
||||
for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
|
||||
((uint64xn_t *) out)[i] =
|
||||
((const uint64xn_t *)a)[i] - ((const uint64xn_t *)b)[i];
|
||||
}
|
||||
/*
|
||||
unsigned int i;
|
||||
for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
out->limb[i] = a->limb[i] - b->limb[i];
|
||||
}
|
||||
*/
|
||||
* unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
|
||||
* out->limb[i] = a->limb[i] - b->limb[i]; }
|
||||
*/
|
||||
}
|
||||
|
||||
/*
 * gf_bias: add a fixed per-limb bias to `a` in place.
 *
 * Every limb receives co1 = (2^56 - 1) * amt, except limb 4, which
 * receives co2 = co1 - amt (see the scalar fallback loop).  The AVX2 and
 * SSE2 branches do the same with vector adds; they place co2 in lane 0 of
 * the vector that covers limb 4, which assumes eight contiguous 64-bit
 * limbs -- TODO confirm against the gf definition.
 * NOTE(review): this looks like adding amt times the field modulus so a
 * later limb-wise subtraction cannot go negative -- confirm with callers.
 */
void gf_bias (gf a, int amt) {
|
||||
uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
|
||||
|
||||
void gf_bias(gf a, int amt)
|
||||
{
|
||||
uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
|
||||
|
||||
#if __AVX2__
|
||||
uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
|
||||
uint64x4_t *aa = (uint64x4_t*) a;
|
||||
uint64x4_t lo = { co1, co1, co1, co1 }, hi = {
|
||||
co2, co1, co1, co1};
|
||||
uint64x4_t *aa = (uint64x4_t *) a;
|
||||
aa[0] += lo;
|
||||
aa[1] += hi;
|
||||
#elif __SSE2__
|
||||
uint64x2_t lo = {co1,co1}, hi = {co2,co1};
|
||||
uint64x2_t *aa = (uint64x2_t*) a;
|
||||
uint64x2_t lo = { co1, co1 }, hi = {
|
||||
co2, co1};
|
||||
uint64x2_t *aa = (uint64x2_t *) a;
|
||||
aa[0] += lo;
|
||||
aa[1] += lo;
|
||||
aa[2] += hi;
|
||||
aa[3] += lo;
|
||||
#else
|
||||
for (unsigned int i=0; i<sizeof(*a)/sizeof(uint64_t); i++) {
|
||||
a->limb[i] += (i==4) ? co2 : co1;
|
||||
for (unsigned int i = 0; i < sizeof(*a) / sizeof(uint64_t); i++) {
|
||||
a->limb[i] += (i == 4) ? co2 : co1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * gf_weak_reduce: one carry-propagation pass over the eight limbs of `a`.
 *
 * Each limb is replaced by its low 56 bits plus the bits above bit 55 of
 * the limb below it.  The overflow of the top limb (tmp) is folded back
 * into both limb 4 and limb 0.  The loop runs from limb 7 downwards so
 * that limb[i - 1] is still unmasked when its carry is read.  A limb may
 * still exceed 56 bits by the carry it just received.
 */
void gf_weak_reduce (gf a) {
|
||||
void gf_weak_reduce(gf a)
|
||||
{
|
||||
/* PERF: use pshufb/palignr if anyone cares about speed of this */
|
||||
uint64_t mask = (1ull<<56) - 1;
|
||||
uint64_t mask = (1ull << 56) - 1;
|
||||
uint64_t tmp = a->limb[7] >> 56;
|
||||
a->limb[4] += tmp;
|
||||
for (unsigned int i=7; i>0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
|
||||
for (unsigned int i = 7; i > 0; i--) {
|
||||
a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
|
||||
}
|
||||
a->limb[0] = (a->limb[0] & mask) + tmp;
|
||||
}
|
||||
|
||||
|
@ -11,10 +11,10 @@
|
||||
*/
|
||||
|
||||
#ifndef __CONSTANT_TIME_H__
|
||||
#define __CONSTANT_TIME_H__ 1
|
||||
# define __CONSTANT_TIME_H__ 1
|
||||
|
||||
#include "word.h"
|
||||
#include <string.h>
|
||||
# include "word.h"
|
||||
# include <string.h>
|
||||
|
||||
/*
|
||||
* Constant-time operations on hopefully-compile-time-sized memory
|
||||
@ -36,20 +36,19 @@
|
||||
* Instead, we're putting our trust in the loop unroller and unswitcher.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Unaligned big (vector?) register.
|
||||
*/
|
||||
/* The packed attribute drops the member's alignment requirement, so the constant-time helpers can cast a pointer at any byte offset to this type and access `unaligned` through it. */
typedef struct {
|
||||
big_register_t unaligned;
|
||||
} __attribute__((packed)) unaligned_br_t;
|
||||
} __attribute__ ((packed)) unaligned_br_t;
|
||||
|
||||
/**
|
||||
* Unaligned word register, for architectures where that matters.
|
||||
*/
|
||||
/* Word-sized counterpart of unaligned_br_t: the packed attribute lets a word_t be accessed through a pointer at any byte offset. */
typedef struct {
|
||||
word_t unaligned;
|
||||
} __attribute__((packed)) unaligned_word_t;
|
||||
} __attribute__ ((packed)) unaligned_word_t;
|
||||
|
||||
/**
|
||||
* @brief Constant-time conditional swap.
|
||||
@ -60,62 +59,58 @@ typedef struct {
|
||||
* as their sizes, if the CPU cares about that sort of thing.
|
||||
*/
|
||||
static __inline__ void
|
||||
__attribute__((unused,always_inline))
|
||||
constant_time_cond_swap (
|
||||
void *__restrict__ a_,
|
||||
void *__restrict__ b_,
|
||||
word_t elem_bytes,
|
||||
mask_t doswap
|
||||
) {
|
||||
__attribute__ ((unused, always_inline))
|
||||
constant_time_cond_swap(void *__restrict__ a_,
|
||||
void *__restrict__ b_, word_t elem_bytes, mask_t doswap)
|
||||
{
|
||||
word_t k;
|
||||
unsigned char *a = (unsigned char *)a_;
|
||||
unsigned char *b = (unsigned char *)b_;
|
||||
|
||||
|
||||
big_register_t br_mask = br_set_to_mask(doswap);
|
||||
for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
|
||||
for (k = 0; k <= elem_bytes - sizeof(big_register_t);
|
||||
k += sizeof(big_register_t)) {
|
||||
if (elem_bytes % sizeof(big_register_t)) {
|
||||
/* unaligned */
|
||||
big_register_t xor =
|
||||
((unaligned_br_t*)(&a[k]))->unaligned
|
||||
^ ((unaligned_br_t*)(&b[k]))->unaligned;
|
||||
((unaligned_br_t *) (&a[k]))->unaligned
|
||||
^ ((unaligned_br_t *) (&b[k]))->unaligned;
|
||||
xor &= br_mask;
|
||||
((unaligned_br_t*)(&a[k]))->unaligned ^= xor;
|
||||
((unaligned_br_t*)(&b[k]))->unaligned ^= xor;
|
||||
((unaligned_br_t *) (&a[k]))->unaligned ^= xor;
|
||||
((unaligned_br_t *) (&b[k]))->unaligned ^= xor;
|
||||
} else {
|
||||
/* aligned */
|
||||
big_register_t xor =
|
||||
*((big_register_t*)(&a[k]))
|
||||
^ *((big_register_t*)(&b[k]));
|
||||
big_register_t xor = *((big_register_t *) (&a[k]))
|
||||
^ *((big_register_t *) (&b[k]));
|
||||
xor &= br_mask;
|
||||
*((big_register_t*)(&a[k])) ^= xor;
|
||||
*((big_register_t*)(&b[k])) ^= xor;
|
||||
*((big_register_t *) (&a[k])) ^= xor;
|
||||
*((big_register_t *) (&b[k])) ^= xor;
|
||||
}
|
||||
}
|
||||
|
||||
if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
|
||||
for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
|
||||
for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
|
||||
if (elem_bytes % sizeof(word_t)) {
|
||||
/* unaligned */
|
||||
word_t xor =
|
||||
((unaligned_word_t*)(&a[k]))->unaligned
|
||||
^ ((unaligned_word_t*)(&b[k]))->unaligned;
|
||||
((unaligned_word_t *) (&a[k]))->unaligned
|
||||
^ ((unaligned_word_t *) (&b[k]))->unaligned;
|
||||
xor &= doswap;
|
||||
((unaligned_word_t*)(&a[k]))->unaligned ^= xor;
|
||||
((unaligned_word_t*)(&b[k]))->unaligned ^= xor;
|
||||
((unaligned_word_t *) (&a[k]))->unaligned ^= xor;
|
||||
((unaligned_word_t *) (&b[k]))->unaligned ^= xor;
|
||||
} else {
|
||||
/* aligned */
|
||||
word_t xor =
|
||||
*((word_t*)(&a[k]))
|
||||
^ *((word_t*)(&b[k]));
|
||||
word_t xor = *((word_t *) (&a[k]))
|
||||
^ *((word_t *) (&b[k]));
|
||||
xor &= doswap;
|
||||
*((word_t*)(&a[k])) ^= xor;
|
||||
*((word_t*)(&b[k])) ^= xor;
|
||||
*((word_t *) (&a[k])) ^= xor;
|
||||
*((word_t *) (&b[k])) ^= xor;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (elem_bytes % sizeof(word_t)) {
|
||||
for (; k<elem_bytes; k+=1) {
|
||||
for (; k < elem_bytes; k += 1) {
|
||||
unsigned char xor = a[k] ^ b[k];
|
||||
xor &= doswap;
|
||||
a[k] ^= xor;
|
||||
@ -133,53 +128,60 @@ constant_time_cond_swap (
|
||||
* The table and output must not alias.
|
||||
*/
|
||||
static __inline__ void
|
||||
__attribute__((unused,always_inline))
|
||||
constant_time_lookup (
|
||||
void *__restrict__ out_,
|
||||
const void *table_,
|
||||
word_t elem_bytes,
|
||||
word_t n_table,
|
||||
word_t idx
|
||||
) {
|
||||
__attribute__ ((unused, always_inline))
|
||||
constant_time_lookup(void *__restrict__ out_,
|
||||
const void *table_,
|
||||
word_t elem_bytes, word_t n_table, word_t idx)
|
||||
{
|
||||
big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
|
||||
|
||||
|
||||
/* Can't do pointer arithmetic on void* */
|
||||
unsigned char *out = (unsigned char *)out_;
|
||||
const unsigned char *table = (const unsigned char *)table_;
|
||||
word_t j,k;
|
||||
|
||||
word_t j, k;
|
||||
|
||||
memset(out, 0, elem_bytes);
|
||||
for (j=0; j<n_table; j++, big_i-=big_one) {
|
||||
for (j = 0; j < n_table; j++, big_i -= big_one) {
|
||||
big_register_t br_mask = br_is_zero(big_i);
|
||||
word_t mask;
|
||||
|
||||
for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
|
||||
for (k = 0; k <= elem_bytes - sizeof(big_register_t);
|
||||
k += sizeof(big_register_t)) {
|
||||
if (elem_bytes % sizeof(big_register_t)) {
|
||||
/* unaligned */
|
||||
((unaligned_br_t *)(out+k))->unaligned
|
||||
|= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
|
||||
((unaligned_br_t *) (out + k))->unaligned
|
||||
|=
|
||||
br_mask &
|
||||
((const unaligned_br_t
|
||||
*)(&table[k + j * elem_bytes]))->unaligned;
|
||||
} else {
|
||||
/* aligned */
|
||||
*(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]);
|
||||
*(big_register_t *) (out + k) |=
|
||||
br_mask & *(const big_register_t
|
||||
*)(&table[k + j * elem_bytes]);
|
||||
}
|
||||
}
|
||||
|
||||
mask = word_is_zero(idx^j);
|
||||
mask = word_is_zero(idx ^ j);
|
||||
if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
|
||||
for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
|
||||
for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
|
||||
if (elem_bytes % sizeof(word_t)) {
|
||||
/* input unaligned, output aligned */
|
||||
*(word_t *)(out+k) |= mask & ((const unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned;
|
||||
*(word_t *) (out + k) |=
|
||||
mask &
|
||||
((const unaligned_word_t
|
||||
*)(&table[k + j * elem_bytes]))->unaligned;
|
||||
} else {
|
||||
/* aligned */
|
||||
*(word_t *)(out+k) |= mask & *(const word_t*)(&table[k+j*elem_bytes]);
|
||||
*(word_t *) (out + k) |=
|
||||
mask & *(const word_t *)(&table[k + j * elem_bytes]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (elem_bytes % sizeof(word_t)) {
|
||||
for (; k<elem_bytes; k+=1) {
|
||||
out[k] |= mask & table[k+j*elem_bytes];
|
||||
for (; k < elem_bytes; k += 1) {
|
||||
out[k] |= mask & table[k + j * elem_bytes];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -195,58 +197,57 @@ constant_time_lookup (
|
||||
* input, it must be equal and not partially overlap.
|
||||
*/
|
||||
/*
 * constant_time_select: a_ = mask ? bTrue_ : bFalse_, over elem_bytes
 * bytes, computed as (mask & bTrue) | (~mask & bFalse) with no branch on
 * `mask`.  (mask is presumably all-ones or all-zeros -- confirm with
 * callers.)
 *
 * The copy runs in up to three phases that share the running offset k:
 *   1. big_register_t-sized chunks, using br_mask;
 *   2. word_t-sized chunks, only if the remainder after phase 1 is at
 *      least one word;
 *   3. single bytes, only if elem_bytes is not a multiple of a word.
 * alignment_bytes is OR-ed with elem_bytes, so the packed "unaligned"
 * access path is taken whenever either value is not a multiple of the
 * chunk size.
 *
 * NOTE(review): the loop bounds `elem_bytes - sizeof(big_register_t)` and
 * `elem_bytes - sizeof(word_t)` are unsigned.  If elem_bytes is smaller
 * than the chunk size the subtraction wraps and the loop would run past
 * the buffers.  Callers presumably always pass elem_bytes >=
 * sizeof(big_register_t) -- confirm.
 */
static __inline__ void
|
||||
__attribute__((unused,always_inline))
|
||||
constant_time_select (
|
||||
void *a_,
|
||||
const void *bFalse_,
|
||||
const void *bTrue_,
|
||||
word_t elem_bytes,
|
||||
mask_t mask,
|
||||
size_t alignment_bytes
|
||||
) {
|
||||
__attribute__ ((unused, always_inline))
|
||||
constant_time_select(void *a_,
|
||||
const void *bFalse_,
|
||||
const void *bTrue_,
|
||||
word_t elem_bytes, mask_t mask, size_t alignment_bytes)
|
||||
{
|
||||
unsigned char *a = (unsigned char *)a_;
|
||||
const unsigned char *bTrue = (const unsigned char *)bTrue_;
|
||||
const unsigned char *bFalse = (const unsigned char *)bFalse_;
|
||||
word_t k;
|
||||
big_register_t br_mask = br_set_to_mask(mask);
|
||||
|
||||
|
||||
alignment_bytes |= elem_bytes;
|
||||
|
||||
for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
|
||||
for (k = 0; k <= elem_bytes - sizeof(big_register_t);
|
||||
k += sizeof(big_register_t)) {
|
||||
if (alignment_bytes % sizeof(big_register_t)) {
|
||||
/* unaligned */
|
||||
((unaligned_br_t*)(&a[k]))->unaligned =
|
||||
( br_mask & ((const unaligned_br_t*)(&bTrue [k]))->unaligned)
|
||||
| (~br_mask & ((const unaligned_br_t*)(&bFalse[k]))->unaligned);
|
||||
((unaligned_br_t *) (&a[k]))->unaligned =
|
||||
(br_mask & ((const unaligned_br_t *)(&bTrue[k]))->unaligned)
|
||||
| (~br_mask &
|
||||
((const unaligned_br_t *)(&bFalse[k]))->unaligned);
|
||||
} else {
|
||||
/* aligned */
|
||||
*(big_register_t *)(a+k) =
|
||||
( br_mask & *(const big_register_t*)(&bTrue [k]))
|
||||
| (~br_mask & *(const big_register_t*)(&bFalse[k]));
|
||||
*(big_register_t *) (a + k) =
|
||||
(br_mask & *(const big_register_t *)(&bTrue[k]))
|
||||
| (~br_mask & *(const big_register_t *)(&bFalse[k]));
|
||||
}
|
||||
}
|
||||
|
||||
if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
|
||||
for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
|
||||
for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
|
||||
if (alignment_bytes % sizeof(word_t)) {
|
||||
/* unaligned */
|
||||
((unaligned_word_t*)(&a[k]))->unaligned =
|
||||
( mask & ((const unaligned_word_t*)(&bTrue [k]))->unaligned)
|
||||
| (~mask & ((const unaligned_word_t*)(&bFalse[k]))->unaligned);
|
||||
((unaligned_word_t *) (&a[k]))->unaligned =
|
||||
(mask & ((const unaligned_word_t *)(&bTrue[k]))->unaligned)
|
||||
| (~mask &
|
||||
((const unaligned_word_t *)(&bFalse[k]))->unaligned);
|
||||
} else {
|
||||
/* aligned */
|
||||
*(word_t *)(a+k) =
|
||||
( mask & *(const word_t*)(&bTrue [k]))
|
||||
| (~mask & *(const word_t*)(&bFalse[k]));
|
||||
*(word_t *) (a + k) = (mask & *(const word_t *)(&bTrue[k]))
|
||||
| (~mask & *(const word_t *)(&bFalse[k]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (elem_bytes % sizeof(word_t)) {
|
||||
for (; k<elem_bytes; k+=1) {
|
||||
a[k] = ( mask & bTrue[k]) | (~mask & bFalse[k]);
|
||||
for (; k < elem_bytes; k += 1) {
|
||||
a[k] = (mask & bTrue[k]) | (~mask & bFalse[k]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __CONSTANT_TIME_H__ */
|
||||
#endif /* __CONSTANT_TIME_H__ */
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -72,24 +72,31 @@ const uint8_t in_u3[56] = {
|
||||
|
||||
const uint8_t out_u3[3][56] = {
|
||||
{
|
||||
0x3f, 0x48, 0x2c, 0x8a, 0x9f, 0x19, 0xb0, 0x1e, 0x6c, 0x46, 0xee, 0x97,
|
||||
0x11, 0xd9, 0xdc, 0x14, 0xfd, 0x4b, 0xf6, 0x7a, 0xf3, 0x07, 0x65, 0xc2,
|
||||
0xae, 0x2b, 0x84, 0x6a, 0x4d, 0x23, 0xa8, 0xcd, 0x0d, 0xb8, 0x97, 0x08,
|
||||
0x62, 0x39, 0x49, 0x2c, 0xaf, 0x35, 0x0b, 0x51, 0xf8, 0x33, 0x86, 0x8b,
|
||||
0x9b, 0xc2, 0xb3, 0xbc, 0xa9, 0xcf, 0x41, 0x13
|
||||
}, {
|
||||
0xaa, 0x3b, 0x47, 0x49, 0xd5, 0x5b, 0x9d, 0xaf, 0x1e, 0x5b, 0x00, 0x28,
|
||||
0x88, 0x26, 0xc4, 0x67, 0x27, 0x4c, 0xe3, 0xeb, 0xbd, 0xd5, 0xc1, 0x7b,
|
||||
0x97, 0x5e, 0x09, 0xd4, 0xaf, 0x6c, 0x67, 0xcf, 0x10, 0xd0, 0x87, 0x20,
|
||||
0x2d, 0xb8, 0x82, 0x86, 0xe2, 0xb7, 0x9f, 0xce, 0xea, 0x3e, 0xc3, 0x53,
|
||||
0xef, 0x54, 0xfa, 0xa2, 0x6e, 0x21, 0x9f, 0x38
|
||||
}, {
|
||||
0x07, 0x7f, 0x45, 0x36, 0x81, 0xca, 0xca, 0x36, 0x93, 0x19, 0x84, 0x20,
|
||||
0xbb, 0xe5, 0x15, 0xca, 0xe0, 0x00, 0x24, 0x72, 0x51, 0x9b, 0x3e, 0x67,
|
||||
0x66, 0x1a, 0x7e, 0x89, 0xca, 0xb9, 0x46, 0x95, 0xc8, 0xf4, 0xbc, 0xd6,
|
||||
0x6e, 0x61, 0xb9, 0xb9, 0xc9, 0x46, 0xda, 0x8d, 0x52, 0x4d, 0xe3, 0xd6,
|
||||
0x9b, 0xd9, 0xd9, 0xd6, 0x6b, 0x99, 0x7e, 0x37
|
||||
}
|
||||
0x3f, 0x48, 0x2c, 0x8a, 0x9f, 0x19, 0xb0, 0x1e, 0x6c, 0x46, 0xee, 0x97,
|
||||
0x11, 0xd9, 0xdc, 0x14, 0xfd, 0x4b, 0xf6, 0x7a, 0xf3, 0x07, 0x65, 0xc2,
|
||||
0xae, 0x2b, 0x84, 0x6a, 0x4d, 0x23, 0xa8, 0xcd, 0x0d, 0xb8, 0x97, 0x08,
|
||||
0x62, 0x39, 0x49, 0x2c, 0xaf, 0x35, 0x0b, 0x51, 0xf8, 0x33, 0x86, 0x8b,
|
||||
0x9b, 0xc2, 0xb3, 0xbc, 0xa9, 0xcf, 0x41, 0x13}, {
|
||||
0xaa, 0x3b, 0x47, 0x49,
|
||||
0xd5, 0x5b, 0x9d, 0xaf,
|
||||
0x1e, 0x5b, 0x00, 0x28,
|
||||
0x88, 0x26, 0xc4, 0x67,
|
||||
0x27, 0x4c, 0xe3, 0xeb,
|
||||
0xbd, 0xd5, 0xc1, 0x7b,
|
||||
0x97, 0x5e, 0x09, 0xd4,
|
||||
0xaf, 0x6c, 0x67, 0xcf,
|
||||
0x10, 0xd0, 0x87, 0x20,
|
||||
0x2d, 0xb8, 0x82, 0x86,
|
||||
0xe2, 0xb7, 0x9f, 0xce,
|
||||
0xea, 0x3e, 0xc3, 0x53,
|
||||
0xef, 0x54, 0xfa, 0xa2,
|
||||
0x6e, 0x21, 0x9f, 0x38},
|
||||
{
|
||||
0x07, 0x7f, 0x45, 0x36, 0x81, 0xca, 0xca, 0x36, 0x93, 0x19, 0x84, 0x20,
|
||||
0xbb, 0xe5, 0x15, 0xca, 0xe0, 0x00, 0x24, 0x72, 0x51, 0x9b, 0x3e, 0x67,
|
||||
0x66, 0x1a, 0x7e, 0x89, 0xca, 0xb9, 0x46, 0x95, 0xc8, 0xf4, 0xbc, 0xd6,
|
||||
0x6e, 0x61, 0xb9, 0xb9, 0xc9, 0x46, 0xda, 0x8d, 0x52, 0x4d, 0xe3, 0xd6,
|
||||
0x9b, 0xd9, 0xd9, 0xd6, 0x6b, 0x99, 0x7e, 0x37}
|
||||
};
|
||||
|
||||
/* Test vectors from RFC8032 for Ed448 */
|
||||
@ -583,14 +590,13 @@ static const uint8_t *dohash(EVP_MD_CTX *hashctx, const uint8_t *msg,
|
||||
static uint8_t hashout[64];
|
||||
|
||||
if (!EVP_DigestInit_ex(hashctx, EVP_shake256(), NULL)
|
||||
|| !EVP_DigestUpdate(hashctx, msg, msglen)
|
||||
|| !EVP_DigestFinalXOF(hashctx, hashout, sizeof(hashout)))
|
||||
|| !EVP_DigestUpdate(hashctx, msg, msglen)
|
||||
|| !EVP_DigestFinalXOF(hashctx, hashout, sizeof(hashout)))
|
||||
return NULL;
|
||||
|
||||
return hashout;
|
||||
}
|
||||
|
||||
|
||||
static int test_eddsa(void)
|
||||
{
|
||||
uint8_t outsig[114];
|
||||
@ -614,7 +620,8 @@ static int test_eddsa(void)
|
||||
goto err;
|
||||
}
|
||||
|
||||
ED448_sign(outsig, msg3, sizeof(msg3), pubkey3, privkey3, context3, sizeof(context3));
|
||||
ED448_sign(outsig, msg3, sizeof(msg3), pubkey3, privkey3, context3,
|
||||
sizeof(context3));
|
||||
if (memcmp(sig3, outsig, sizeof(sig3)) != 0) {
|
||||
printf("Calculated sig and expected sig differ (3)\n");
|
||||
goto err;
|
||||
@ -683,7 +690,7 @@ int main(int argc, char *argv[])
|
||||
int j = -1;
|
||||
|
||||
if (argc != 1 && (argc != 2 || strcmp(argv[1], "-f") != 0)) {
|
||||
printf ("Usage: curve448_test [-f]\n");
|
||||
printf("Usage: curve448_test [-f]\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -725,8 +732,9 @@ int main(int argc, char *argv[])
|
||||
if (i == 1 || i == 1000 || i == 1000000) {
|
||||
j++;
|
||||
if (memcmp(out, out_u3[j], sizeof(out)) != 0) {
|
||||
printf("Calculated output and expected output differ (3, %ud)\n",
|
||||
i);
|
||||
printf
|
||||
("Calculated output and expected output differ (3, %ud)\n",
|
||||
i);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
@ -11,47 +11,47 @@
|
||||
*/
|
||||
|
||||
#ifndef __DECAF_COMMON_H__
|
||||
#define __DECAF_COMMON_H__ 1
|
||||
# define __DECAF_COMMON_H__ 1
|
||||
|
||||
#include <openssl/e_os2.h>
|
||||
# include <openssl/e_os2.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Internal word types.
|
||||
*
|
||||
* Somewhat tricky. This could be decided separately per platform. However,
|
||||
* the structs do need to be all the same size and alignment on a given
|
||||
* platform to support dynamic linking, since even if your header was built
|
||||
* with eg arch_neon, you might end up linking a library built with arch_arm32.
|
||||
/*
|
||||
* Internal word types. Somewhat tricky. This could be decided separately per
|
||||
* platform. However, the structs do need to be all the same size and
|
||||
* alignment on a given platform to support dynamic linking, since even if your
|
||||
* header was built with eg arch_neon, you might end up linking a library built
|
||||
* with arch_arm32.
|
||||
*/
|
||||
#ifndef DECAF_WORD_BITS
|
||||
#if (defined(__ILP64__) || defined(__amd64__) || defined(__x86_64__) || (((__UINT_FAST32_MAX__)>>30)>>30))
|
||||
#define DECAF_WORD_BITS 64 /**< The number of bits in a word */
|
||||
#else
|
||||
#define DECAF_WORD_BITS 32 /**< The number of bits in a word */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if DECAF_WORD_BITS == 64
|
||||
# ifndef DECAF_WORD_BITS
|
||||
# if (defined(__ILP64__) || defined(__amd64__) || defined(__x86_64__) || (((__UINT_FAST32_MAX__)>>30)>>30))
|
||||
# define DECAF_WORD_BITS 64 /**< The number of bits in a word */
|
||||
# else
|
||||
# define DECAF_WORD_BITS 32 /**< The number of bits in a word */
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# if DECAF_WORD_BITS == 64
|
||||
typedef uint64_t decaf_word_t; /**< Word size for internal computations */
|
||||
typedef int64_t decaf_sword_t; /**< Signed word size for internal computations */
|
||||
typedef uint64_t decaf_bool_t; /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
|
||||
typedef __uint128_t decaf_dword_t; /**< Double-word size for internal computations */
|
||||
typedef __int128_t decaf_dsword_t; /**< Signed double-word size for internal computations */
|
||||
#elif DECAF_WORD_BITS == 32 /**< The number of bits in a word */
|
||||
# elif DECAF_WORD_BITS == 32 /**< The number of bits in a word */
|
||||
typedef uint32_t decaf_word_t; /**< Word size for internal computations */
|
||||
typedef int32_t decaf_sword_t; /**< Signed word size for internal computations */
|
||||
typedef uint32_t decaf_bool_t; /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
|
||||
typedef uint64_t decaf_dword_t; /**< Double-word size for internal computations */
|
||||
typedef int64_t decaf_dsword_t; /**< Signed double-word size for internal computations */
|
||||
#else
|
||||
#error "Only supporting DECAF_WORD_BITS = 32 or 64 for now"
|
||||
#endif
|
||||
|
||||
# else
|
||||
# error "Only supporting DECAF_WORD_BITS = 32 or 64 for now"
|
||||
# endif
|
||||
|
||||
/** DECAF_TRUE = -1 so that DECAF_TRUE & x = x */
|
||||
static const decaf_bool_t DECAF_TRUE = -(decaf_bool_t)1;
|
||||
static const decaf_bool_t DECAF_TRUE = -(decaf_bool_t) 1;
|
||||
|
||||
/** DECAF_FALSE = 0 so that DECAF_FALSE & x = 0 */
|
||||
static const decaf_bool_t DECAF_FALSE = 0;
|
||||
@ -62,22 +62,21 @@ typedef enum {
|
||||
DECAF_FAILURE = 0 /**< The operation failed. */
|
||||
} decaf_error_t;
|
||||
|
||||
|
||||
/** Return success if x is true */
|
||||
static ossl_inline decaf_error_t
|
||||
decaf_succeed_if(decaf_bool_t x) {
|
||||
return (decaf_error_t)x;
|
||||
static ossl_inline decaf_error_t decaf_succeed_if(decaf_bool_t x)
|
||||
{
|
||||
return (decaf_error_t) x;
|
||||
}
|
||||
|
||||
/** Return DECAF_TRUE iff e == DECAF_SUCCESS */
|
||||
static ossl_inline decaf_bool_t
|
||||
decaf_successful(decaf_error_t e) {
|
||||
decaf_dword_t w = ((decaf_word_t)e) ^ ((decaf_word_t)DECAF_SUCCESS);
|
||||
return (w-1)>>DECAF_WORD_BITS;
|
||||
static ossl_inline decaf_bool_t decaf_successful(decaf_error_t e)
|
||||
{
|
||||
decaf_dword_t w = ((decaf_word_t) e) ^ ((decaf_word_t) DECAF_SUCCESS);
|
||||
return (w - 1) >> DECAF_WORD_BITS;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* __DECAF_COMMON_H__ */
|
||||
|
||||
#endif /* __DECAF_COMMON_H__ */
|
||||
|
@ -11,31 +11,31 @@
|
||||
*/
|
||||
|
||||
#ifndef __DECAF_ED448_H__
|
||||
#define __DECAF_ED448_H__ 1
|
||||
# define __DECAF_ED448_H__ 1
|
||||
|
||||
#include "point_448.h"
|
||||
# include "point_448.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** Number of bytes in an EdDSA public key. */
|
||||
#define DECAF_EDDSA_448_PUBLIC_BYTES 57
|
||||
# define DECAF_EDDSA_448_PUBLIC_BYTES 57
|
||||
|
||||
/** Number of bytes in an EdDSA private key. */
|
||||
#define DECAF_EDDSA_448_PRIVATE_BYTES DECAF_EDDSA_448_PUBLIC_BYTES
|
||||
# define DECAF_EDDSA_448_PRIVATE_BYTES DECAF_EDDSA_448_PUBLIC_BYTES
|
||||
|
||||
/** Number of bytes in an EdDSA signature. */
|
||||
#define DECAF_EDDSA_448_SIGNATURE_BYTES (DECAF_EDDSA_448_PUBLIC_BYTES + DECAF_EDDSA_448_PRIVATE_BYTES)
|
||||
# define DECAF_EDDSA_448_SIGNATURE_BYTES (DECAF_EDDSA_448_PUBLIC_BYTES + DECAF_EDDSA_448_PRIVATE_BYTES)
|
||||
|
||||
/** Does EdDSA support non-contextual signatures? */
|
||||
#define DECAF_EDDSA_448_SUPPORTS_CONTEXTLESS_SIGS 0
|
||||
# define DECAF_EDDSA_448_SUPPORTS_CONTEXTLESS_SIGS 0
|
||||
|
||||
/** EdDSA encoding ratio. */
|
||||
#define DECAF_448_EDDSA_ENCODE_RATIO 4
|
||||
# define DECAF_448_EDDSA_ENCODE_RATIO 4
|
||||
|
||||
/** EdDSA decoding ratio. */
|
||||
#define DECAF_448_EDDSA_DECODE_RATIO (4 / 4)
|
||||
# define DECAF_448_EDDSA_DECODE_RATIO (4 / 4)
|
||||
|
||||
/**
|
||||
* @brief EdDSA key generation. This function uses a different (non-Decaf)
|
||||
@ -43,11 +43,14 @@ extern "C" {
|
||||
*
|
||||
* @param [out] pubkey The public key.
|
||||
* @param [in] privkey The private key.
|
||||
*/
|
||||
decaf_error_t decaf_ed448_derive_public_key (
|
||||
uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
);
|
||||
*/
|
||||
decaf_error_t decaf_ed448_derive_public_key(uint8_t
|
||||
pubkey
|
||||
[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t
|
||||
privkey
|
||||
[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief EdDSA signing.
|
||||
@ -65,17 +68,17 @@ decaf_error_t decaf_ed448_derive_public_key (
|
||||
* messages, at least without some very careful protocol-level disambiguation. For Ed448 it is
|
||||
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
|
||||
* you no seat belt.
|
||||
*/
|
||||
decaf_error_t decaf_ed448_sign (
|
||||
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message,
|
||||
size_t message_len,
|
||||
uint8_t prehashed,
|
||||
const uint8_t *context,
|
||||
size_t context_len
|
||||
) __attribute__((nonnull(1,2,3)));
|
||||
*/
|
||||
decaf_error_t decaf_ed448_sign(uint8_t
|
||||
signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message, size_t message_len,
|
||||
uint8_t prehashed, const uint8_t *context,
|
||||
size_t context_len)
|
||||
__attribute__ ((nonnull(1, 2, 3)));
|
||||
|
||||
/**
|
||||
* @brief EdDSA signing with prehash.
|
||||
@ -91,15 +94,18 @@ decaf_error_t decaf_ed448_sign (
|
||||
* messages, at least without some very careful protocol-level disambiguation. For Ed448 it is
|
||||
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
|
||||
* you no seat belt.
|
||||
*/
|
||||
decaf_error_t decaf_ed448_sign_prehash (
|
||||
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
size_t context_len
|
||||
) __attribute__((nonnull(1,2,3,4)));
|
||||
*/
|
||||
decaf_error_t decaf_ed448_sign_prehash(uint8_t
|
||||
signature
|
||||
[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
size_t context_len)
|
||||
__attribute__ ((nonnull(1, 2, 3, 4)));
|
||||
|
||||
/**
|
||||
* @brief EdDSA signature verification.
|
||||
@ -119,15 +125,14 @@ decaf_error_t decaf_ed448_sign_prehash (
|
||||
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
|
||||
* you no seat belt.
|
||||
*/
|
||||
decaf_error_t decaf_ed448_verify (
|
||||
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message,
|
||||
size_t message_len,
|
||||
uint8_t prehashed,
|
||||
const uint8_t *context,
|
||||
uint8_t context_len
|
||||
) __attribute__((nonnull(1,2)));
|
||||
decaf_error_t decaf_ed448_verify(const uint8_t
|
||||
signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message, size_t message_len,
|
||||
uint8_t prehashed, const uint8_t *context,
|
||||
uint8_t context_len)
|
||||
__attribute__ ((nonnull(1, 2)));
|
||||
|
||||
/**
|
||||
* @brief EdDSA signature verification.
|
||||
@ -145,13 +150,15 @@ decaf_error_t decaf_ed448_verify (
|
||||
* safe. The C++ wrapper is designed to make it harder to screw this up, but this C code gives
|
||||
* you no seat belt.
|
||||
*/
|
||||
decaf_error_t decaf_ed448_verify_prehash (
|
||||
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
uint8_t context_len
|
||||
) __attribute__((nonnull(1,2)));
|
||||
decaf_error_t decaf_ed448_verify_prehash(const uint8_t
|
||||
signature
|
||||
[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
uint8_t context_len)
|
||||
__attribute__ ((nonnull(1, 2)));
|
||||
|
||||
/**
|
||||
* @brief EdDSA point encoding. Used internally, exposed externally.
|
||||
@ -176,11 +183,12 @@ decaf_error_t decaf_ed448_verify_prehash (
|
||||
*
|
||||
* @param [out] enc The encoded point.
|
||||
* @param [in] p The point.
|
||||
*/
|
||||
void curve448_point_mul_by_ratio_and_encode_like_eddsa (
|
||||
uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const curve448_point_t p
|
||||
);
|
||||
*/
|
||||
void curve448_point_mul_by_ratio_and_encode_like_eddsa(uint8_t
|
||||
enc
|
||||
[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const curve448_point_t
|
||||
p);
|
||||
|
||||
/**
|
||||
* @brief EdDSA point decoding. Multiplies by DECAF_448_EDDSA_DECODE_RATIO,
|
||||
@ -190,11 +198,13 @@ void curve448_point_mul_by_ratio_and_encode_like_eddsa (
|
||||
*
|
||||
* @param [in] enc The encoded point.
|
||||
* @param [out] p The point.
|
||||
*/
|
||||
decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio (
|
||||
curve448_point_t p,
|
||||
const uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES]
|
||||
);
|
||||
*/
|
||||
decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio(curve448_point_t
|
||||
p,
|
||||
const uint8_t
|
||||
enc
|
||||
[DECAF_EDDSA_448_PUBLIC_BYTES]
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief EdDSA to ECDH public key conversion
|
||||
@ -207,10 +217,10 @@ decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio (
|
||||
* @param[out] x The ECDH public key as in RFC7748 (point on Montgomery curve)
|
||||
* @param[in] ed The EdDSA public key (point on Edwards curve)
|
||||
*/
|
||||
void decaf_ed448_convert_public_key_to_x448 (
|
||||
uint8_t x[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t ed[DECAF_EDDSA_448_PUBLIC_BYTES]
|
||||
);
|
||||
void decaf_ed448_convert_public_key_to_x448(uint8_t x[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t
|
||||
ed[DECAF_EDDSA_448_PUBLIC_BYTES]
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief EdDSA to ECDH private key conversion
|
||||
@ -220,13 +230,16 @@ void decaf_ed448_convert_public_key_to_x448 (
|
||||
* @param[out] x The ECDH private key as in RFC7748
|
||||
* @param[in] ed The EdDSA private key
|
||||
*/
|
||||
decaf_error_t decaf_ed448_convert_private_key_to_x448 (
|
||||
uint8_t x[DECAF_X448_PRIVATE_BYTES],
|
||||
const uint8_t ed[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
);
|
||||
decaf_error_t decaf_ed448_convert_private_key_to_x448(uint8_t
|
||||
x
|
||||
[DECAF_X448_PRIVATE_BYTES],
|
||||
const uint8_t
|
||||
ed
|
||||
[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* __DECAF_ED448_H__ */
|
||||
#endif /* __DECAF_ED448_H__ */
|
||||
|
@ -27,12 +27,12 @@
|
||||
|
||||
#if NO_CONTEXT
|
||||
const uint8_t NO_CONTEXT_POINTS_HERE = 0;
|
||||
const uint8_t * const DECAF_ED448_NO_CONTEXT = &NO_CONTEXT_POINTS_HERE;
|
||||
const uint8_t *const DECAF_ED448_NO_CONTEXT = &NO_CONTEXT_POINTS_HERE;
|
||||
#endif
|
||||
|
||||
/* EDDSA_BASE_POINT_RATIO = 1 or 2
|
||||
* Because EdDSA25519 is not on E_d but on the isogenous E_sigma_d,
|
||||
* its base point is twice ours.
|
||||
/*
|
||||
* EDDSA_BASE_POINT_RATIO = 1 or 2. Because EdDSA25519 is not on E_d but on the
|
||||
* isogenous E_sigma_d, its base point is twice ours.
|
||||
*/
|
||||
#define EDDSA_BASE_POINT_RATIO (1+EDDSA_USE_SIGMA_ISOGENY) /* TODO: remove */
|
||||
|
||||
@ -45,8 +45,8 @@ static decaf_error_t oneshot_hash(uint8_t *out, size_t outlen,
|
||||
return DECAF_FAILURE;
|
||||
|
||||
if (!EVP_DigestInit_ex(hashctx, EVP_shake256(), NULL)
|
||||
|| !EVP_DigestUpdate(hashctx, in, inlen)
|
||||
|| !EVP_DigestFinalXOF(hashctx, out, outlen)) {
|
||||
|| !EVP_DigestUpdate(hashctx, in, inlen)
|
||||
|| !EVP_DigestFinalXOF(hashctx, out, outlen)) {
|
||||
EVP_MD_CTX_free(hashctx);
|
||||
return DECAF_FAILURE;
|
||||
}
|
||||
@ -55,11 +55,10 @@ static decaf_error_t oneshot_hash(uint8_t *out, size_t outlen,
|
||||
return DECAF_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static void clamp (
|
||||
uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
) {
|
||||
uint8_t hibit = (1<<0)>>1;
|
||||
static void clamp(uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
)
|
||||
{
|
||||
uint8_t hibit = (1 << 0) >> 1;
|
||||
|
||||
/* Blarg */
|
||||
secret_scalar_ser[0] &= -COFACTOR;
|
||||
@ -67,18 +66,17 @@ static void clamp (
|
||||
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] = 0;
|
||||
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 2] |= 0x80;
|
||||
} else {
|
||||
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] &= hibit-1;
|
||||
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] &= hibit - 1;
|
||||
secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] |= hibit;
|
||||
}
|
||||
}
|
||||
|
||||
static decaf_error_t hash_init_with_dom(
|
||||
EVP_MD_CTX *hashctx,
|
||||
uint8_t prehashed,
|
||||
uint8_t for_prehash,
|
||||
const uint8_t *context,
|
||||
size_t context_len
|
||||
) {
|
||||
static decaf_error_t hash_init_with_dom(EVP_MD_CTX *hashctx,
|
||||
uint8_t prehashed,
|
||||
uint8_t for_prehash,
|
||||
const uint8_t *context,
|
||||
size_t context_len)
|
||||
{
|
||||
const char *dom_s = "SigEd448";
|
||||
uint8_t dom[2];
|
||||
|
||||
@ -99,33 +97,38 @@ static decaf_error_t hash_init_with_dom(
|
||||
#endif
|
||||
|
||||
if (!EVP_DigestInit_ex(hashctx, EVP_shake256(), NULL)
|
||||
|| !EVP_DigestUpdate(hashctx, dom_s, strlen(dom_s))
|
||||
|| !EVP_DigestUpdate(hashctx, dom, sizeof(dom))
|
||||
|| !EVP_DigestUpdate(hashctx, context, context_len))
|
||||
|| !EVP_DigestUpdate(hashctx, dom_s, strlen(dom_s))
|
||||
|| !EVP_DigestUpdate(hashctx, dom, sizeof(dom))
|
||||
|| !EVP_DigestUpdate(hashctx, context, context_len))
|
||||
return DECAF_FAILURE;
|
||||
|
||||
return DECAF_SUCCESS;
|
||||
}
|
||||
|
||||
/* In this file because it uses the hash */
|
||||
decaf_error_t decaf_ed448_convert_private_key_to_x448 (
|
||||
uint8_t x[DECAF_X448_PRIVATE_BYTES],
|
||||
const uint8_t ed[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
) {
|
||||
decaf_error_t decaf_ed448_convert_private_key_to_x448(uint8_t
|
||||
x
|
||||
[DECAF_X448_PRIVATE_BYTES],
|
||||
const uint8_t
|
||||
ed
|
||||
[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
)
|
||||
{
|
||||
/* pass the private key through oneshot_hash function */
|
||||
/* and keep the first DECAF_X448_PRIVATE_BYTES bytes */
|
||||
return oneshot_hash(
|
||||
x,
|
||||
DECAF_X448_PRIVATE_BYTES,
|
||||
ed,
|
||||
DECAF_EDDSA_448_PRIVATE_BYTES
|
||||
);
|
||||
return oneshot_hash(x,
|
||||
DECAF_X448_PRIVATE_BYTES,
|
||||
ed, DECAF_EDDSA_448_PRIVATE_BYTES);
|
||||
}
|
||||
|
||||
decaf_error_t decaf_ed448_derive_public_key (
|
||||
uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
) {
|
||||
|
||||
decaf_error_t decaf_ed448_derive_public_key(uint8_t
|
||||
pubkey
|
||||
[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t
|
||||
privkey
|
||||
[DECAF_EDDSA_448_PRIVATE_BYTES]
|
||||
)
|
||||
{
|
||||
/* only this much used for keygen */
|
||||
uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
curve448_scalar_t secret_scalar;
|
||||
@ -138,22 +141,25 @@ decaf_error_t decaf_ed448_derive_public_key (
|
||||
}
|
||||
clamp(secret_scalar_ser);
|
||||
|
||||
curve448_scalar_decode_long(secret_scalar, secret_scalar_ser, sizeof(secret_scalar_ser));
|
||||
|
||||
/* Since we are going to mul_by_cofactor during encoding, divide by it here.
|
||||
* However, the EdDSA base point is not the same as the decaf base point if
|
||||
* the sigma isogeny is in use: the EdDSA base point is on Etwist_d/(1-d) and
|
||||
* the decaf base point is on Etwist_d, and when converted it effectively
|
||||
* picks up a factor of 2 from the isogenies. So we might start at 2 instead of 1.
|
||||
curve448_scalar_decode_long(secret_scalar, secret_scalar_ser,
|
||||
sizeof(secret_scalar_ser));
|
||||
|
||||
/*
|
||||
* Since we are going to mul_by_cofactor during encoding, divide by it
|
||||
* here. However, the EdDSA base point is not the same as the decaf base
|
||||
* point if the sigma isogeny is in use: the EdDSA base point is on
|
||||
* Etwist_d/(1-d) and the decaf base point is on Etwist_d, and when
|
||||
* converted it effectively picks up a factor of 2 from the isogenies. So
|
||||
* we might start at 2 instead of 1.
|
||||
*/
|
||||
for (c=1; c<DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
|
||||
curve448_scalar_halve(secret_scalar,secret_scalar);
|
||||
for (c = 1; c < DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
|
||||
curve448_scalar_halve(secret_scalar, secret_scalar);
|
||||
}
|
||||
|
||||
curve448_precomputed_scalarmul(p,curve448_precomputed_base,secret_scalar);
|
||||
|
||||
|
||||
curve448_precomputed_scalarmul(p, curve448_precomputed_base, secret_scalar);
|
||||
|
||||
curve448_point_mul_by_ratio_and_encode_like_eddsa(pubkey, p);
|
||||
|
||||
|
||||
/* Cleanup */
|
||||
curve448_scalar_destroy(secret_scalar);
|
||||
curve448_point_destroy(p);
|
||||
@ -162,21 +168,21 @@ decaf_error_t decaf_ed448_derive_public_key (
|
||||
return DECAF_SUCCESS;
|
||||
}
|
||||
|
||||
decaf_error_t decaf_ed448_sign (
|
||||
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message,
|
||||
size_t message_len,
|
||||
uint8_t prehashed,
|
||||
const uint8_t *context,
|
||||
size_t context_len
|
||||
) {
|
||||
decaf_error_t decaf_ed448_sign(uint8_t
|
||||
signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message, size_t message_len,
|
||||
uint8_t prehashed, const uint8_t *context,
|
||||
size_t context_len)
|
||||
{
|
||||
curve448_scalar_t secret_scalar;
|
||||
EVP_MD_CTX *hashctx = EVP_MD_CTX_new();
|
||||
decaf_error_t ret = DECAF_FAILURE;
|
||||
curve448_scalar_t nonce_scalar;
|
||||
uint8_t nonce_point[DECAF_EDDSA_448_PUBLIC_BYTES] = {0};
|
||||
uint8_t nonce_point[DECAF_EDDSA_448_PUBLIC_BYTES] = { 0 };
|
||||
unsigned int c;
|
||||
curve448_scalar_t challenge_scalar;
|
||||
|
||||
@ -188,28 +194,28 @@ decaf_error_t decaf_ed448_sign (
|
||||
struct {
|
||||
uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
uint8_t seed[DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
} __attribute__((packed)) expanded;
|
||||
} __attribute__ ((packed)) expanded;
|
||||
|
||||
if (!oneshot_hash((uint8_t *)&expanded, sizeof(expanded), privkey,
|
||||
DECAF_EDDSA_448_PRIVATE_BYTES))
|
||||
goto err;
|
||||
clamp(expanded.secret_scalar_ser);
|
||||
curve448_scalar_decode_long(secret_scalar, expanded.secret_scalar_ser, sizeof(expanded.secret_scalar_ser));
|
||||
|
||||
clamp(expanded.secret_scalar_ser);
|
||||
curve448_scalar_decode_long(secret_scalar, expanded.secret_scalar_ser,
|
||||
sizeof(expanded.secret_scalar_ser));
|
||||
|
||||
/* Hash to create the nonce */
|
||||
if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
|
||||
|| !EVP_DigestUpdate(hashctx, expanded.seed,
|
||||
sizeof(expanded.seed))
|
||||
|| !EVP_DigestUpdate(hashctx, message, message_len)) {
|
||||
|| !EVP_DigestUpdate(hashctx, expanded.seed, sizeof(expanded.seed))
|
||||
|| !EVP_DigestUpdate(hashctx, message, message_len)) {
|
||||
OPENSSL_cleanse(&expanded, sizeof(expanded));
|
||||
goto err;
|
||||
}
|
||||
OPENSSL_cleanse(&expanded, sizeof(expanded));
|
||||
}
|
||||
|
||||
|
||||
/* Decode the nonce */
|
||||
{
|
||||
uint8_t nonce[2*DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
uint8_t nonce[2 * DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
|
||||
if (!EVP_DigestFinalXOF(hashctx, nonce, sizeof(nonce)))
|
||||
goto err;
|
||||
@ -222,40 +228,42 @@ decaf_error_t decaf_ed448_sign (
|
||||
curve448_scalar_t nonce_scalar_2;
|
||||
curve448_point_t p;
|
||||
|
||||
curve448_scalar_halve(nonce_scalar_2,nonce_scalar);
|
||||
curve448_scalar_halve(nonce_scalar_2, nonce_scalar);
|
||||
for (c = 2; c < DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
|
||||
curve448_scalar_halve(nonce_scalar_2,nonce_scalar_2);
|
||||
curve448_scalar_halve(nonce_scalar_2, nonce_scalar_2);
|
||||
}
|
||||
|
||||
curve448_precomputed_scalarmul(p,curve448_precomputed_base,nonce_scalar_2);
|
||||
curve448_precomputed_scalarmul(p, curve448_precomputed_base,
|
||||
nonce_scalar_2);
|
||||
curve448_point_mul_by_ratio_and_encode_like_eddsa(nonce_point, p);
|
||||
curve448_point_destroy(p);
|
||||
curve448_scalar_destroy(nonce_scalar_2);
|
||||
}
|
||||
|
||||
{
|
||||
uint8_t challenge[2*DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
uint8_t challenge[2 * DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
|
||||
/* Compute the challenge */
|
||||
if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
|
||||
|| !EVP_DigestUpdate(hashctx, nonce_point, sizeof(nonce_point))
|
||||
|| !EVP_DigestUpdate(hashctx, pubkey,
|
||||
DECAF_EDDSA_448_PUBLIC_BYTES)
|
||||
|| !EVP_DigestUpdate(hashctx, message, message_len)
|
||||
|| !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge)))
|
||||
|| !EVP_DigestUpdate(hashctx, nonce_point, sizeof(nonce_point))
|
||||
|| !EVP_DigestUpdate(hashctx, pubkey, DECAF_EDDSA_448_PUBLIC_BYTES)
|
||||
|| !EVP_DigestUpdate(hashctx, message, message_len)
|
||||
|| !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge)))
|
||||
goto err;
|
||||
|
||||
curve448_scalar_decode_long(challenge_scalar,challenge,sizeof(challenge));
|
||||
OPENSSL_cleanse(challenge,sizeof(challenge));
|
||||
curve448_scalar_decode_long(challenge_scalar, challenge,
|
||||
sizeof(challenge));
|
||||
OPENSSL_cleanse(challenge, sizeof(challenge));
|
||||
}
|
||||
|
||||
curve448_scalar_mul(challenge_scalar,challenge_scalar,secret_scalar);
|
||||
curve448_scalar_add(challenge_scalar,challenge_scalar,nonce_scalar);
|
||||
|
||||
OPENSSL_cleanse(signature,DECAF_EDDSA_448_SIGNATURE_BYTES);
|
||||
memcpy(signature,nonce_point,sizeof(nonce_point));
|
||||
curve448_scalar_encode(&signature[DECAF_EDDSA_448_PUBLIC_BYTES],challenge_scalar);
|
||||
|
||||
|
||||
curve448_scalar_mul(challenge_scalar, challenge_scalar, secret_scalar);
|
||||
curve448_scalar_add(challenge_scalar, challenge_scalar, nonce_scalar);
|
||||
|
||||
OPENSSL_cleanse(signature, DECAF_EDDSA_448_SIGNATURE_BYTES);
|
||||
memcpy(signature, nonce_point, sizeof(nonce_point));
|
||||
curve448_scalar_encode(&signature[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
challenge_scalar);
|
||||
|
||||
curve448_scalar_destroy(secret_scalar);
|
||||
curve448_scalar_destroy(nonce_scalar);
|
||||
curve448_scalar_destroy(challenge_scalar);
|
||||
@ -266,97 +274,103 @@ decaf_error_t decaf_ed448_sign (
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
decaf_error_t decaf_ed448_sign_prehash (
|
||||
uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
size_t context_len
|
||||
) {
|
||||
return decaf_ed448_sign(signature,privkey,pubkey,hash,64,1,context,
|
||||
decaf_error_t decaf_ed448_sign_prehash(uint8_t
|
||||
signature
|
||||
[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
size_t context_len)
|
||||
{
|
||||
return decaf_ed448_sign(signature, privkey, pubkey, hash, 64, 1, context,
|
||||
context_len);
|
||||
/*OPENSSL_cleanse(hash,sizeof(hash));*/
|
||||
/*
|
||||
* OPENSSL_cleanse(hash,sizeof(hash));
|
||||
*/
|
||||
}
|
||||
|
||||
decaf_error_t decaf_ed448_verify (
|
||||
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message,
|
||||
size_t message_len,
|
||||
uint8_t prehashed,
|
||||
const uint8_t *context,
|
||||
uint8_t context_len
|
||||
) {
|
||||
decaf_error_t decaf_ed448_verify(const uint8_t
|
||||
signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t *message, size_t message_len,
|
||||
uint8_t prehashed, const uint8_t *context,
|
||||
uint8_t context_len)
|
||||
{
|
||||
curve448_point_t pk_point, r_point;
|
||||
decaf_error_t error = curve448_point_decode_like_eddsa_and_mul_by_ratio(pk_point,pubkey);
|
||||
decaf_error_t error =
|
||||
curve448_point_decode_like_eddsa_and_mul_by_ratio(pk_point, pubkey);
|
||||
curve448_scalar_t challenge_scalar;
|
||||
curve448_scalar_t response_scalar;
|
||||
unsigned int c;
|
||||
|
||||
if (DECAF_SUCCESS != error) { return error; }
|
||||
|
||||
error = curve448_point_decode_like_eddsa_and_mul_by_ratio(r_point,signature);
|
||||
if (DECAF_SUCCESS != error) { return error; }
|
||||
|
||||
if (DECAF_SUCCESS != error) {
|
||||
return error;
|
||||
}
|
||||
|
||||
error =
|
||||
curve448_point_decode_like_eddsa_and_mul_by_ratio(r_point, signature);
|
||||
if (DECAF_SUCCESS != error) {
|
||||
return error;
|
||||
}
|
||||
|
||||
{
|
||||
/* Compute the challenge */
|
||||
EVP_MD_CTX *hashctx = EVP_MD_CTX_new();
|
||||
uint8_t challenge[2*DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
uint8_t challenge[2 * DECAF_EDDSA_448_PRIVATE_BYTES];
|
||||
|
||||
if (hashctx == NULL
|
||||
|| !hash_init_with_dom(hashctx, prehashed, 0, context,
|
||||
context_len)
|
||||
|| !EVP_DigestUpdate(hashctx, signature,
|
||||
DECAF_EDDSA_448_PUBLIC_BYTES)
|
||||
|| !EVP_DigestUpdate(hashctx, pubkey,
|
||||
DECAF_EDDSA_448_PUBLIC_BYTES)
|
||||
|| !EVP_DigestUpdate(hashctx, message, message_len)
|
||||
|| !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge))) {
|
||||
|| !hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
|
||||
|| !EVP_DigestUpdate(hashctx, signature,
|
||||
DECAF_EDDSA_448_PUBLIC_BYTES)
|
||||
|| !EVP_DigestUpdate(hashctx, pubkey, DECAF_EDDSA_448_PUBLIC_BYTES)
|
||||
|| !EVP_DigestUpdate(hashctx, message, message_len)
|
||||
|| !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge))) {
|
||||
EVP_MD_CTX_free(hashctx);
|
||||
return DECAF_FAILURE;
|
||||
}
|
||||
|
||||
EVP_MD_CTX_free(hashctx);
|
||||
curve448_scalar_decode_long(challenge_scalar,challenge,sizeof(challenge));
|
||||
OPENSSL_cleanse(challenge,sizeof(challenge));
|
||||
curve448_scalar_decode_long(challenge_scalar, challenge,
|
||||
sizeof(challenge));
|
||||
OPENSSL_cleanse(challenge, sizeof(challenge));
|
||||
}
|
||||
curve448_scalar_sub(challenge_scalar, curve448_scalar_zero, challenge_scalar);
|
||||
curve448_scalar_sub(challenge_scalar, curve448_scalar_zero,
|
||||
challenge_scalar);
|
||||
|
||||
curve448_scalar_decode_long(
|
||||
response_scalar,
|
||||
&signature[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
DECAF_EDDSA_448_PRIVATE_BYTES
|
||||
);
|
||||
|
||||
for (c=1; c<DECAF_448_EDDSA_DECODE_RATIO; c<<=1) {
|
||||
curve448_scalar_add(response_scalar,response_scalar,response_scalar);
|
||||
curve448_scalar_decode_long(response_scalar,
|
||||
&signature[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
DECAF_EDDSA_448_PRIVATE_BYTES);
|
||||
|
||||
for (c = 1; c < DECAF_448_EDDSA_DECODE_RATIO; c <<= 1) {
|
||||
curve448_scalar_add(response_scalar, response_scalar, response_scalar);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* pk_point = -c(x(P)) + (cx + k)G = kG */
|
||||
curve448_base_double_scalarmul_non_secret(
|
||||
pk_point,
|
||||
response_scalar,
|
||||
pk_point,
|
||||
challenge_scalar
|
||||
);
|
||||
return decaf_succeed_if(curve448_point_eq(pk_point,r_point));
|
||||
curve448_base_double_scalarmul_non_secret(pk_point,
|
||||
response_scalar,
|
||||
pk_point, challenge_scalar);
|
||||
return decaf_succeed_if(curve448_point_eq(pk_point, r_point));
|
||||
}
|
||||
|
||||
|
||||
decaf_error_t decaf_ed448_verify_prehash (
|
||||
const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
uint8_t context_len
|
||||
) {
|
||||
decaf_error_t decaf_ed448_verify_prehash(const uint8_t
|
||||
signature
|
||||
[DECAF_EDDSA_448_SIGNATURE_BYTES],
|
||||
const uint8_t
|
||||
pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
|
||||
const uint8_t hash[64],
|
||||
const uint8_t *context,
|
||||
uint8_t context_len)
|
||||
{
|
||||
decaf_error_t ret;
|
||||
|
||||
ret = decaf_ed448_verify(signature,pubkey,hash,64,1,context,context_len);
|
||||
|
||||
|
||||
ret =
|
||||
decaf_ed448_verify(signature, pubkey, hash, 64, 1, context,
|
||||
context_len);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -367,10 +381,9 @@ int ED448_sign(uint8_t *out_sig, const uint8_t *message, size_t message_len,
|
||||
|
||||
return decaf_ed448_sign(out_sig, private_key, public_key, message,
|
||||
message_len, 0, context, context_len)
|
||||
== DECAF_SUCCESS;
|
||||
== DECAF_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
int ED448_verify(const uint8_t *message, size_t message_len,
|
||||
const uint8_t signature[114], const uint8_t public_key[57],
|
||||
const uint8_t *context, size_t context_len)
|
||||
@ -397,8 +410,8 @@ int ED448ph_verify(const uint8_t hash[64], const uint8_t signature[114],
|
||||
}
|
||||
|
||||
int ED448_public_from_private(uint8_t out_public_key[57],
|
||||
const uint8_t private_key[57])
|
||||
const uint8_t private_key[57])
|
||||
{
|
||||
return decaf_ed448_derive_public_key(out_public_key, private_key)
|
||||
== DECAF_SUCCESS;
|
||||
== DECAF_SUCCESS;
|
||||
}
|
||||
|
@ -12,37 +12,35 @@
|
||||
|
||||
#include "field.h"
|
||||
|
||||
mask_t gf_isr (
|
||||
gf a,
|
||||
const gf x
|
||||
) {
|
||||
mask_t gf_isr(gf a, const gf x)
|
||||
{
|
||||
gf L0, L1, L2;
|
||||
gf_sqr (L1, x );
|
||||
gf_mul (L2, x, L1 );
|
||||
gf_sqr (L1, L2 );
|
||||
gf_mul (L2, x, L1 );
|
||||
gf_sqrn (L1, L2, 3 );
|
||||
gf_mul (L0, L2, L1 );
|
||||
gf_sqrn (L1, L0, 3 );
|
||||
gf_mul (L0, L2, L1 );
|
||||
gf_sqrn (L2, L0, 9 );
|
||||
gf_mul (L1, L0, L2 );
|
||||
gf_sqr (L0, L1 );
|
||||
gf_mul (L2, x, L0 );
|
||||
gf_sqrn (L0, L2, 18 );
|
||||
gf_mul (L2, L1, L0 );
|
||||
gf_sqrn (L0, L2, 37 );
|
||||
gf_mul (L1, L2, L0 );
|
||||
gf_sqrn (L0, L1, 37 );
|
||||
gf_mul (L1, L2, L0 );
|
||||
gf_sqrn (L0, L1, 111 );
|
||||
gf_mul (L2, L1, L0 );
|
||||
gf_sqr (L0, L2 );
|
||||
gf_mul (L1, x, L0 );
|
||||
gf_sqrn (L0, L1, 223 );
|
||||
gf_mul (L1, L2, L0 );
|
||||
gf_sqr (L2, L1);
|
||||
gf_mul (L0, L2, x);
|
||||
gf_copy(a,L1);
|
||||
return gf_eq(L0,ONE);
|
||||
gf_sqr(L1, x);
|
||||
gf_mul(L2, x, L1);
|
||||
gf_sqr(L1, L2);
|
||||
gf_mul(L2, x, L1);
|
||||
gf_sqrn(L1, L2, 3);
|
||||
gf_mul(L0, L2, L1);
|
||||
gf_sqrn(L1, L0, 3);
|
||||
gf_mul(L0, L2, L1);
|
||||
gf_sqrn(L2, L0, 9);
|
||||
gf_mul(L1, L0, L2);
|
||||
gf_sqr(L0, L1);
|
||||
gf_mul(L2, x, L0);
|
||||
gf_sqrn(L0, L2, 18);
|
||||
gf_mul(L2, L1, L0);
|
||||
gf_sqrn(L0, L2, 37);
|
||||
gf_mul(L1, L2, L0);
|
||||
gf_sqrn(L0, L1, 37);
|
||||
gf_mul(L1, L2, L0);
|
||||
gf_sqrn(L0, L1, 111);
|
||||
gf_mul(L2, L1, L0);
|
||||
gf_sqr(L0, L2);
|
||||
gf_mul(L1, x, L0);
|
||||
gf_sqrn(L0, L1, 223);
|
||||
gf_mul(L1, L2, L0);
|
||||
gf_sqr(L2, L1);
|
||||
gf_mul(L0, L2, x);
|
||||
gf_copy(a, L1);
|
||||
return gf_eq(L0, ONE);
|
||||
}
|
||||
|
@ -11,91 +11,97 @@
|
||||
*/
|
||||
|
||||
#ifndef __P448_F_FIELD_H__
|
||||
#define __P448_F_FIELD_H__ 1
|
||||
# define __P448_F_FIELD_H__ 1
|
||||
|
||||
#include "constant_time.h"
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
# include "constant_time.h"
|
||||
# include <string.h>
|
||||
# include <assert.h>
|
||||
|
||||
#include "word.h"
|
||||
# include "word.h"
|
||||
|
||||
#define __DECAF_448_GF_DEFINED__ 1
|
||||
#define NLIMBS (64/sizeof(word_t))
|
||||
#define X_SER_BYTES 56
|
||||
#define SER_BYTES 56
|
||||
# define __DECAF_448_GF_DEFINED__ 1
|
||||
# define NLIMBS (64/sizeof(word_t))
|
||||
# define X_SER_BYTES 56
|
||||
# define SER_BYTES 56
|
||||
typedef struct gf_448_s {
|
||||
word_t limb[NLIMBS];
|
||||
} __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
|
||||
} __attribute__ ((aligned(32))) gf_448_s, gf_448_t[1];
|
||||
|
||||
#define GF_LIT_LIMB_BITS 56
|
||||
#define GF_BITS 448
|
||||
#define ZERO gf_448_ZERO
|
||||
#define ONE gf_448_ONE
|
||||
#define MODULUS gf_448_MODULUS
|
||||
#define gf gf_448_t
|
||||
#define gf_s gf_448_s
|
||||
#define gf_eq gf_448_eq
|
||||
#define gf_hibit gf_448_hibit
|
||||
#define gf_lobit gf_448_lobit
|
||||
#define gf_copy gf_448_copy
|
||||
#define gf_add gf_448_add
|
||||
#define gf_sub gf_448_sub
|
||||
#define gf_add_RAW gf_448_add_RAW
|
||||
#define gf_sub_RAW gf_448_sub_RAW
|
||||
#define gf_bias gf_448_bias
|
||||
#define gf_weak_reduce gf_448_weak_reduce
|
||||
#define gf_strong_reduce gf_448_strong_reduce
|
||||
#define gf_mul gf_448_mul
|
||||
#define gf_sqr gf_448_sqr
|
||||
#define gf_mulw_unsigned gf_448_mulw_unsigned
|
||||
#define gf_isr gf_448_isr
|
||||
#define gf_serialize gf_448_serialize
|
||||
#define gf_deserialize gf_448_deserialize
|
||||
# define GF_LIT_LIMB_BITS 56
|
||||
# define GF_BITS 448
|
||||
# define ZERO gf_448_ZERO
|
||||
# define ONE gf_448_ONE
|
||||
# define MODULUS gf_448_MODULUS
|
||||
# define gf gf_448_t
|
||||
# define gf_s gf_448_s
|
||||
# define gf_eq gf_448_eq
|
||||
# define gf_hibit gf_448_hibit
|
||||
# define gf_lobit gf_448_lobit
|
||||
# define gf_copy gf_448_copy
|
||||
# define gf_add gf_448_add
|
||||
# define gf_sub gf_448_sub
|
||||
# define gf_add_RAW gf_448_add_RAW
|
||||
# define gf_sub_RAW gf_448_sub_RAW
|
||||
# define gf_bias gf_448_bias
|
||||
# define gf_weak_reduce gf_448_weak_reduce
|
||||
# define gf_strong_reduce gf_448_strong_reduce
|
||||
# define gf_mul gf_448_mul
|
||||
# define gf_sqr gf_448_sqr
|
||||
# define gf_mulw_unsigned gf_448_mulw_unsigned
|
||||
# define gf_isr gf_448_isr
|
||||
# define gf_serialize gf_448_serialize
|
||||
# define gf_deserialize gf_448_deserialize
|
||||
|
||||
/* RFC 7748 support */
|
||||
#define X_PUBLIC_BYTES X_SER_BYTES
|
||||
#define X_PRIVATE_BYTES X_PUBLIC_BYTES
|
||||
#define X_PRIVATE_BITS 448
|
||||
# define X_PUBLIC_BYTES X_SER_BYTES
|
||||
# define X_PRIVATE_BYTES X_PUBLIC_BYTES
|
||||
# define X_PRIVATE_BITS 448
|
||||
|
||||
#define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))
|
||||
# define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Defined below in f_impl.h */
|
||||
static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; }
|
||||
static INLINE_UNUSED void gf_add_RAW (gf out, const gf a, const gf b);
|
||||
static INLINE_UNUSED void gf_sub_RAW (gf out, const gf a, const gf b);
|
||||
static INLINE_UNUSED void gf_bias (gf inout, int amount);
|
||||
static INLINE_UNUSED void gf_weak_reduce (gf inout);
|
||||
static INLINE_UNUSED void gf_copy(gf out, const gf a)
|
||||
{
|
||||
*out = *a;
|
||||
}
|
||||
|
||||
void gf_strong_reduce (gf inout);
|
||||
void gf_add (gf out, const gf a, const gf b);
|
||||
void gf_sub (gf out, const gf a, const gf b);
|
||||
void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
|
||||
void gf_mulw_unsigned (gf_s *__restrict__ out, const gf a, uint32_t b);
|
||||
void gf_sqr (gf_s *__restrict__ out, const gf a);
|
||||
static INLINE_UNUSED void gf_add_RAW(gf out, const gf a, const gf b);
|
||||
static INLINE_UNUSED void gf_sub_RAW(gf out, const gf a, const gf b);
|
||||
static INLINE_UNUSED void gf_bias(gf inout, int amount);
|
||||
static INLINE_UNUSED void gf_weak_reduce(gf inout);
|
||||
|
||||
void gf_strong_reduce(gf inout);
|
||||
void gf_add(gf out, const gf a, const gf b);
|
||||
void gf_sub(gf out, const gf a, const gf b);
|
||||
void gf_mul(gf_s * __restrict__ out, const gf a, const gf b);
|
||||
void gf_mulw_unsigned(gf_s * __restrict__ out, const gf a, uint32_t b);
|
||||
void gf_sqr(gf_s * __restrict__ out, const gf a);
|
||||
mask_t gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0. Return true if successful */
|
||||
mask_t gf_eq (const gf x, const gf y);
|
||||
mask_t gf_lobit (const gf x);
|
||||
mask_t gf_hibit (const gf x);
|
||||
mask_t gf_eq(const gf x, const gf y);
|
||||
mask_t gf_lobit(const gf x);
|
||||
mask_t gf_hibit(const gf x);
|
||||
|
||||
void gf_serialize (uint8_t *serial, const gf x,int with_highbit);
|
||||
mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES],int with_hibit,uint8_t hi_nmask);
|
||||
void gf_serialize(uint8_t *serial, const gf x, int with_highbit);
|
||||
mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
|
||||
uint8_t hi_nmask);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#include "f_impl.h" /* Bring in the inline implementations */
|
||||
# include "f_impl.h" /* Bring in the inline implementations */
|
||||
|
||||
#ifndef LIMBPERM
|
||||
#define LIMBPERM(i) (i)
|
||||
#endif
|
||||
#define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
|
||||
# ifndef LIMBPERM
|
||||
# define LIMBPERM(i) (i)
|
||||
# endif
|
||||
# define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
|
||||
|
||||
static const gf ZERO = {{{0}}}, ONE = {{{1}}};
|
||||
static const gf ZERO = { {{0}} }, ONE = { { {
|
||||
1}}};
|
||||
|
||||
#endif /* __P448_F_FIELD_H__ */
|
||||
#endif /* __P448_F_FIELD_H__ */
|
||||
|
@ -11,24 +11,29 @@
|
||||
*/
|
||||
#include "field.h"
|
||||
|
||||
static const gf MODULUS = {FIELD_LITERAL(
|
||||
0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 0xfffffffffffffe, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff
|
||||
)};
|
||||
static const gf MODULUS =
|
||||
{ FIELD_LITERAL(0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff,
|
||||
0xffffffffffffff, 0xfffffffffffffe, 0xffffffffffffff,
|
||||
0xffffffffffffff, 0xffffffffffffff)
|
||||
};
|
||||
|
||||
/** Serialize to wire format. */
|
||||
void gf_serialize (uint8_t serial[SER_BYTES], const gf x, int with_hibit) {
|
||||
unsigned int j=0, fill=0;
|
||||
void gf_serialize(uint8_t serial[SER_BYTES], const gf x, int with_hibit)
|
||||
{
|
||||
unsigned int j = 0, fill = 0;
|
||||
dword_t buffer = 0;
|
||||
unsigned int i;
|
||||
gf red;
|
||||
|
||||
gf_copy(red, x);
|
||||
gf_strong_reduce(red);
|
||||
if (!with_hibit) { assert(gf_hibit(red) == 0); }
|
||||
if (!with_hibit) {
|
||||
assert(gf_hibit(red) == 0);
|
||||
}
|
||||
|
||||
UNROLL for (i=0; i<(with_hibit ? X_SER_BYTES : SER_BYTES); i++) {
|
||||
UNROLL for (i = 0; i < (with_hibit ? X_SER_BYTES : SER_BYTES); i++) {
|
||||
if (fill < 8 && j < NLIMBS) {
|
||||
buffer |= ((dword_t)red->limb[LIMBPERM(j)]) << fill;
|
||||
buffer |= ((dword_t) red->limb[LIMBPERM(j)]) << fill;
|
||||
fill += LIMB_PLACE_VALUE(LIMBPERM(j));
|
||||
j++;
|
||||
}
|
||||
@ -39,78 +44,90 @@ void gf_serialize (uint8_t serial[SER_BYTES], const gf x, int with_hibit) {
|
||||
}
|
||||
|
||||
/** Return high bit of x = low bit of 2x mod p */
|
||||
mask_t gf_hibit(const gf x) {
|
||||
mask_t gf_hibit(const gf x)
|
||||
{
|
||||
gf y;
|
||||
gf_add(y,x,x);
|
||||
gf_add(y, x, x);
|
||||
gf_strong_reduce(y);
|
||||
return -(y->limb[0]&1);
|
||||
return -(y->limb[0] & 1);
|
||||
}
|
||||
|
||||
/** Return low bit of x, taken after reducing x to canonical form */
|
||||
mask_t gf_lobit(const gf x) {
|
||||
mask_t gf_lobit(const gf x)
|
||||
{
|
||||
gf y;
|
||||
gf_copy(y,x);
|
||||
gf_copy(y, x);
|
||||
gf_strong_reduce(y);
|
||||
return -(y->limb[0]&1);
|
||||
return -(y->limb[0] & 1);
|
||||
}
|
||||
|
||||
/** Deserialize from wire format; return -1 on success and 0 on failure. */
|
||||
mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES], int with_hibit, uint8_t hi_nmask) {
|
||||
unsigned int j=0, fill=0;
|
||||
mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
|
||||
uint8_t hi_nmask)
|
||||
{
|
||||
unsigned int j = 0, fill = 0;
|
||||
dword_t buffer = 0;
|
||||
dsword_t scarry = 0;
|
||||
const unsigned nbytes = with_hibit ? X_SER_BYTES : SER_BYTES;
|
||||
unsigned int i;
|
||||
mask_t succ;
|
||||
|
||||
UNROLL for (i=0; i<NLIMBS; i++) {
|
||||
UNROLL for (i = 0; i < NLIMBS; i++) {
|
||||
UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < nbytes) {
|
||||
uint8_t sj = serial[j];
|
||||
if (j==nbytes-1) sj &= ~hi_nmask;
|
||||
buffer |= ((dword_t)sj) << fill;
|
||||
if (j == nbytes - 1)
|
||||
sj &= ~hi_nmask;
|
||||
buffer |= ((dword_t) sj) << fill;
|
||||
fill += 8;
|
||||
j++;
|
||||
}
|
||||
x->limb[LIMBPERM(i)] = (i<NLIMBS-1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer;
|
||||
x->limb[LIMBPERM(i)] =
|
||||
(i < NLIMBS - 1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer;
|
||||
fill -= LIMB_PLACE_VALUE(LIMBPERM(i));
|
||||
buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i));
|
||||
scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t));
|
||||
scarry =
|
||||
(scarry + x->limb[LIMBPERM(i)] -
|
||||
MODULUS->limb[LIMBPERM(i)]) >> (8 * sizeof(word_t));
|
||||
}
|
||||
succ = with_hibit ? -(mask_t)1 : ~gf_hibit(x);
|
||||
succ = with_hibit ? -(mask_t) 1 : ~gf_hibit(x);
|
||||
return succ & word_is_zero(buffer) & ~word_is_zero(scarry);
|
||||
}
|
||||
|
||||
/** Reduce to canonical form. */
|
||||
void gf_strong_reduce (gf a) {
|
||||
void gf_strong_reduce(gf a)
|
||||
{
|
||||
dsword_t scarry;
|
||||
word_t scarry_0;
|
||||
dword_t carry = 0;
|
||||
unsigned int i;
|
||||
|
||||
/* first, clear high */
|
||||
gf_weak_reduce(a); /* Determined to have negligible perf impact. */
|
||||
gf_weak_reduce(a); /* Determined to have negligible perf impact. */
|
||||
|
||||
/* now the total is less than 2p */
|
||||
|
||||
/* compute total_value - p. No need to reduce mod p. */
|
||||
scarry = 0;
|
||||
for (i=0; i<NLIMBS; i++) {
|
||||
for (i = 0; i < NLIMBS; i++) {
|
||||
scarry = scarry + a->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)];
|
||||
a->limb[LIMBPERM(i)] = scarry & LIMB_MASK(LIMBPERM(i));
|
||||
scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
|
||||
}
|
||||
|
||||
/* uncommon case: it was >= p, so now scarry = 0 and this = x
|
||||
* common case: it was < p, so now scarry = -1 and this = x - p + 2^448
|
||||
* so let's add back in p. will carry back off the top for 2^448.
|
||||
/*
|
||||
* uncommon case: it was >= p, so now scarry = 0 and this = x.
|
||||
* common case: it was < p, so now scarry = -1 and this = x - p + 2^448,
|
||||
* so let's add back in p; that will carry back off the top for 2^448.
|
||||
*/
|
||||
assert(word_is_zero(scarry) | word_is_zero(scarry+1));
|
||||
assert(word_is_zero(scarry) | word_is_zero(scarry + 1));
|
||||
|
||||
scarry_0 = scarry;
|
||||
|
||||
/* add it back */
|
||||
for (i=0; i<NLIMBS; i++) {
|
||||
carry = carry + a->limb[LIMBPERM(i)] + (scarry_0 & MODULUS->limb[LIMBPERM(i)]);
|
||||
for (i = 0; i < NLIMBS; i++) {
|
||||
carry =
|
||||
carry + a->limb[LIMBPERM(i)] +
|
||||
(scarry_0 & MODULUS->limb[LIMBPERM(i)]);
|
||||
a->limb[LIMBPERM(i)] = carry & LIMB_MASK(LIMBPERM(i));
|
||||
carry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
|
||||
}
|
||||
@ -119,28 +136,31 @@ void gf_strong_reduce (gf a) {
|
||||
}
|
||||
|
||||
/** Subtract two gf elements d=a-b */
|
||||
void gf_sub (gf d, const gf a, const gf b) {
|
||||
gf_sub_RAW ( d, a, b );
|
||||
gf_bias( d, 2 );
|
||||
gf_weak_reduce ( d );
|
||||
void gf_sub(gf d, const gf a, const gf b)
|
||||
{
|
||||
gf_sub_RAW(d, a, b);
|
||||
gf_bias(d, 2);
|
||||
gf_weak_reduce(d);
|
||||
}
|
||||
|
||||
/** Add two field elements d = a+b */
|
||||
void gf_add (gf d, const gf a, const gf b) {
|
||||
gf_add_RAW ( d, a, b );
|
||||
gf_weak_reduce ( d );
|
||||
void gf_add(gf d, const gf a, const gf b)
|
||||
{
|
||||
gf_add_RAW(d, a, b);
|
||||
gf_weak_reduce(d);
|
||||
}
|
||||
|
||||
/** Compare a==b */
|
||||
mask_t gf_eq(const gf a, const gf b) {
|
||||
mask_t gf_eq(const gf a, const gf b)
|
||||
{
|
||||
gf c;
|
||||
mask_t ret=0;
|
||||
mask_t ret = 0;
|
||||
unsigned int i;
|
||||
|
||||
gf_sub(c,a,b);
|
||||
gf_sub(c, a, b);
|
||||
gf_strong_reduce(c);
|
||||
|
||||
for (i=0; i<NLIMBS; i++) {
|
||||
for (i = 0; i < NLIMBS; i++) {
|
||||
ret |= c->limb[LIMBPERM(i)];
|
||||
}
|
||||
|
||||
|
@ -11,85 +11,90 @@
|
||||
*/
|
||||
|
||||
#ifndef __GF_H__
|
||||
#define __GF_H__
|
||||
# define __GF_H__
|
||||
|
||||
# include "constant_time.h"
|
||||
# include "f_field.h"
|
||||
# include <string.h>
|
||||
|
||||
#include "constant_time.h"
|
||||
#include "f_field.h"
|
||||
#include <string.h>
|
||||
|
||||
/** Square x, n times. */
|
||||
static ossl_inline void gf_sqrn (
|
||||
gf_s *__restrict__ y,
|
||||
const gf x,
|
||||
int n
|
||||
) {
|
||||
static ossl_inline void gf_sqrn(gf_s * __restrict__ y, const gf x, int n)
|
||||
{
|
||||
gf tmp;
|
||||
assert(n>0);
|
||||
if (n&1) {
|
||||
gf_sqr(y,x);
|
||||
assert(n > 0);
|
||||
if (n & 1) {
|
||||
gf_sqr(y, x);
|
||||
n--;
|
||||
} else {
|
||||
gf_sqr(tmp,x);
|
||||
gf_sqr(y,tmp);
|
||||
n-=2;
|
||||
gf_sqr(tmp, x);
|
||||
gf_sqr(y, tmp);
|
||||
n -= 2;
|
||||
}
|
||||
for (; n; n-=2) {
|
||||
gf_sqr(tmp,y);
|
||||
gf_sqr(y,tmp);
|
||||
for (; n; n -= 2) {
|
||||
gf_sqr(tmp, y);
|
||||
gf_sqr(y, tmp);
|
||||
}
|
||||
}
|
||||
|
||||
#define gf_add_nr gf_add_RAW
|
||||
# define gf_add_nr gf_add_RAW
|
||||
|
||||
/** Subtract mod p. Bias by 2 and don't reduce */
|
||||
static ossl_inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
|
||||
gf_sub_RAW(c,a,b);
|
||||
static ossl_inline void gf_sub_nr(gf c, const gf a, const gf b)
|
||||
{
|
||||
gf_sub_RAW(c, a, b);
|
||||
gf_bias(c, 2);
|
||||
if (GF_HEADROOM < 3) gf_weak_reduce(c);
|
||||
if (GF_HEADROOM < 3)
|
||||
gf_weak_reduce(c);
|
||||
}
|
||||
|
||||
/** Subtract mod p. Bias by amt but don't reduce. */
|
||||
static ossl_inline void gf_subx_nr ( gf c, const gf a, const gf b, int amt ) {
|
||||
gf_sub_RAW(c,a,b);
|
||||
static ossl_inline void gf_subx_nr(gf c, const gf a, const gf b, int amt)
|
||||
{
|
||||
gf_sub_RAW(c, a, b);
|
||||
gf_bias(c, amt);
|
||||
if (GF_HEADROOM < amt+1) gf_weak_reduce(c);
|
||||
if (GF_HEADROOM < amt + 1)
|
||||
gf_weak_reduce(c);
|
||||
}
|
||||
|
||||
/** Mul by signed int. Not constant-time WRT the sign of that int. */
|
||||
static ossl_inline void gf_mulw(gf c, const gf a, int32_t w) {
|
||||
if (w>0) {
|
||||
static ossl_inline void gf_mulw(gf c, const gf a, int32_t w)
|
||||
{
|
||||
if (w > 0) {
|
||||
gf_mulw_unsigned(c, a, w);
|
||||
} else {
|
||||
gf_mulw_unsigned(c, a, -w);
|
||||
gf_sub(c,ZERO,c);
|
||||
gf_sub(c, ZERO, c);
|
||||
}
|
||||
}
|
||||
|
||||
/** Constant time, x = is_z ? z : y */
|
||||
static ossl_inline void gf_cond_sel(gf x, const gf y, const gf z, mask_t is_z) {
|
||||
constant_time_select(x,y,z,sizeof(gf),is_z,0);
|
||||
static ossl_inline void gf_cond_sel(gf x, const gf y, const gf z, mask_t is_z)
|
||||
{
|
||||
constant_time_select(x, y, z, sizeof(gf), is_z, 0);
|
||||
}
|
||||
|
||||
/** Constant time, if (neg) x=-x; */
|
||||
static ossl_inline void gf_cond_neg(gf x, mask_t neg) {
|
||||
static ossl_inline void gf_cond_neg(gf x, mask_t neg)
|
||||
{
|
||||
gf y;
|
||||
gf_sub(y,ZERO,x);
|
||||
gf_cond_sel(x,x,y,neg);
|
||||
gf_sub(y, ZERO, x);
|
||||
gf_cond_sel(x, x, y, neg);
|
||||
}
|
||||
|
||||
/** Constant time, if (swap) (x,y) = (y,x); */
|
||||
static ossl_inline void
|
||||
gf_cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
|
||||
constant_time_cond_swap(x,y,sizeof(gf_s),swap);
|
||||
static ossl_inline void gf_cond_swap(gf x, gf_s * __restrict__ y, mask_t swap)
|
||||
{
|
||||
constant_time_cond_swap(x, y, sizeof(gf_s), swap);
|
||||
}
|
||||
|
||||
static ossl_inline void gf_mul_qnr(gf_s *__restrict__ out, const gf x) {
|
||||
gf_sub(out,ZERO,x);
|
||||
static ossl_inline void gf_mul_qnr(gf_s * __restrict__ out, const gf x)
|
||||
{
|
||||
gf_sub(out, ZERO, x);
|
||||
}
|
||||
|
||||
static ossl_inline void gf_div_qnr(gf_s *__restrict__ out, const gf x) {
|
||||
gf_sub(out,ZERO,x);
|
||||
static ossl_inline void gf_div_qnr(gf_s * __restrict__ out, const gf x)
|
||||
{
|
||||
gf_sub(out, ZERO, x);
|
||||
}
|
||||
|
||||
|
||||
#endif /* __GF_H__ */
|
||||
#endif /* __GF_H__ */
|
||||
|
@ -11,52 +11,52 @@
|
||||
*/
|
||||
|
||||
#ifndef __DECAF_POINT_448_H__
|
||||
#define __DECAF_POINT_448_H__ 1
|
||||
# define __DECAF_POINT_448_H__ 1
|
||||
|
||||
#include "curve448utils.h"
|
||||
#include "field.h"
|
||||
# include "curve448utils.h"
|
||||
# include "field.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/** @cond internal */
|
||||
#define DECAF_448_SCALAR_LIMBS ((446-1)/DECAF_WORD_BITS+1)
|
||||
# define DECAF_448_SCALAR_LIMBS ((446-1)/DECAF_WORD_BITS+1)
|
||||
/** @endcond */
|
||||
|
||||
/** The number of bits in a scalar */
|
||||
#define DECAF_448_SCALAR_BITS 446
|
||||
# define DECAF_448_SCALAR_BITS 446
|
||||
|
||||
/** Number of bytes in a serialized point. */
|
||||
#define DECAF_448_SER_BYTES 56
|
||||
# define DECAF_448_SER_BYTES 56
|
||||
|
||||
/** Number of bytes in an elligated point. For now set the same as SER_BYTES
|
||||
* but could be different for other curves.
|
||||
*/
|
||||
#define DECAF_448_HASH_BYTES 56
|
||||
# define DECAF_448_HASH_BYTES 56
|
||||
|
||||
/** Number of bytes in a serialized scalar. */
|
||||
#define DECAF_448_SCALAR_BYTES 56
|
||||
# define DECAF_448_SCALAR_BYTES 56
|
||||
|
||||
/** Number of bits in the "which" field of an elligator inverse */
|
||||
#define DECAF_448_INVERT_ELLIGATOR_WHICH_BITS 3
|
||||
# define DECAF_448_INVERT_ELLIGATOR_WHICH_BITS 3
|
||||
|
||||
/** The cofactor the curve would have, if we hadn't removed it */
|
||||
#define DECAF_448_REMOVED_COFACTOR 4
|
||||
# define DECAF_448_REMOVED_COFACTOR 4
|
||||
|
||||
/** X448 encoding ratio. */
|
||||
#define DECAF_X448_ENCODE_RATIO 2
|
||||
# define DECAF_X448_ENCODE_RATIO 2
|
||||
|
||||
/** Number of bytes in an x448 public key */
|
||||
#define DECAF_X448_PUBLIC_BYTES 56
|
||||
# define DECAF_X448_PUBLIC_BYTES 56
|
||||
|
||||
/** Number of bytes in an x448 private key */
|
||||
#define DECAF_X448_PRIVATE_BYTES 56
|
||||
# define DECAF_X448_PRIVATE_BYTES 56
|
||||
|
||||
/** Twisted Edwards extended homogeneous coordinates */
|
||||
typedef struct curve448_point_s {
|
||||
/** @cond internal */
|
||||
gf_448_t x,y,z,t;
|
||||
gf_448_t x, y, z, t;
|
||||
/** @endcond */
|
||||
} curve448_point_t[1];
|
||||
|
||||
@ -64,7 +64,7 @@ typedef struct curve448_point_s {
|
||||
struct curve448_precomputed_s;
|
||||
|
||||
/** Precomputed table based on a point. Can be trivial implementation. */
|
||||
typedef struct curve448_precomputed_s curve448_precomputed_s;
|
||||
typedef struct curve448_precomputed_s curve448_precomputed_s;
|
||||
|
||||
/** Scalar is stored packed, because we don't need the speed. */
|
||||
typedef struct curve448_scalar_s {
|
||||
@ -98,10 +98,10 @@ extern const struct curve448_precomputed_s *curve448_precomputed_base;
|
||||
* @retval DECAF_FAILURE The scalar was greater than the modulus,
|
||||
* and has been reduced modulo that modulus.
|
||||
*/
|
||||
__owur decaf_error_t curve448_scalar_decode (
|
||||
curve448_scalar_t out,
|
||||
const unsigned char ser[DECAF_448_SCALAR_BYTES]
|
||||
);
|
||||
__owur decaf_error_t curve448_scalar_decode(curve448_scalar_t out,
|
||||
const unsigned char
|
||||
ser[DECAF_448_SCALAR_BYTES]
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Read a scalar from wire format or from bytes. Reduces mod
|
||||
@ -111,68 +111,51 @@ __owur decaf_error_t curve448_scalar_decode (
|
||||
* @param [in] ser_len Length of serialized form.
|
||||
* @param [out] out Deserialized form.
|
||||
*/
|
||||
void curve448_scalar_decode_long (
|
||||
curve448_scalar_t out,
|
||||
const unsigned char *ser,
|
||||
size_t ser_len
|
||||
);
|
||||
|
||||
void curve448_scalar_decode_long(curve448_scalar_t out,
|
||||
const unsigned char *ser, size_t ser_len);
|
||||
|
||||
/**
|
||||
* @brief Serialize a scalar to wire format.
|
||||
*
|
||||
* @param [out] ser Serialized form of a scalar.
|
||||
* @param [in] s Deserialized scalar.
|
||||
*/
|
||||
void curve448_scalar_encode (
|
||||
unsigned char ser[DECAF_448_SCALAR_BYTES],
|
||||
const curve448_scalar_t s
|
||||
);
|
||||
|
||||
void curve448_scalar_encode(unsigned char ser[DECAF_448_SCALAR_BYTES],
|
||||
const curve448_scalar_t s);
|
||||
|
||||
/**
|
||||
* @brief Add two scalars. The scalars may use the same memory.
|
||||
* @param [in] a One scalar.
|
||||
* @param [in] b Another scalar.
|
||||
* @param [out] out a+b.
|
||||
*/
|
||||
void curve448_scalar_add (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a,
|
||||
const curve448_scalar_t b
|
||||
);
|
||||
void curve448_scalar_add(curve448_scalar_t out,
|
||||
const curve448_scalar_t a, const curve448_scalar_t b);
|
||||
|
||||
/**
|
||||
* @brief Subtract two scalars. The scalars may use the same memory.
|
||||
* @param [in] a One scalar.
|
||||
* @param [in] b Another scalar.
|
||||
* @param [out] out a-b.
|
||||
*/
|
||||
void curve448_scalar_sub (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a,
|
||||
const curve448_scalar_t b
|
||||
);
|
||||
*/
|
||||
void curve448_scalar_sub(curve448_scalar_t out,
|
||||
const curve448_scalar_t a, const curve448_scalar_t b);
|
||||
|
||||
/**
|
||||
* @brief Multiply two scalars. The scalars may use the same memory.
|
||||
* @param [in] a One scalar.
|
||||
* @param [in] b Another scalar.
|
||||
* @param [out] out a*b.
|
||||
*/
|
||||
void curve448_scalar_mul (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a,
|
||||
const curve448_scalar_t b
|
||||
);
|
||||
|
||||
*/
|
||||
void curve448_scalar_mul(curve448_scalar_t out,
|
||||
const curve448_scalar_t a, const curve448_scalar_t b);
|
||||
|
||||
/**
|
||||
* @brief Halve a scalar. The scalars may use the same memory.
|
||||
* @param [in] a A scalar.
|
||||
* @param [out] out a/2.
|
||||
*/
|
||||
void curve448_scalar_halve (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a
|
||||
);
|
||||
void curve448_scalar_halve(curve448_scalar_t out, const curve448_scalar_t a);
|
||||
|
||||
/**
|
||||
* @brief Copy a scalar. The scalars may use the same memory, in which
|
||||
@ -180,10 +163,9 @@ void curve448_scalar_halve (
|
||||
* @param [in] a A scalar.
|
||||
* @param [out] out Will become a copy of a.
|
||||
*/
|
||||
static ossl_inline void curve448_scalar_copy (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a
|
||||
) {
|
||||
static ossl_inline void curve448_scalar_copy(curve448_scalar_t out,
|
||||
const curve448_scalar_t a)
|
||||
{
|
||||
*out = *a;
|
||||
}
|
||||
|
||||
@ -194,11 +176,10 @@ static ossl_inline void curve448_scalar_copy (
|
||||
* @param [out] a A copy of the point.
|
||||
* @param [in] b Any point.
|
||||
*/
|
||||
static ossl_inline void curve448_point_copy (
|
||||
curve448_point_t a,
|
||||
const curve448_point_t b
|
||||
) {
|
||||
*a=*b;
|
||||
static ossl_inline void curve448_point_copy(curve448_point_t a,
|
||||
const curve448_point_t b)
|
||||
{
|
||||
*a = *b;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -210,10 +191,8 @@ static ossl_inline void curve448_point_copy (
|
||||
* @retval DECAF_TRUE The points are equal.
|
||||
* @retval DECAF_FALSE The points are not equal.
|
||||
*/
|
||||
__owur decaf_bool_t curve448_point_eq (
|
||||
const curve448_point_t a,
|
||||
const curve448_point_t b
|
||||
);
|
||||
__owur decaf_bool_t curve448_point_eq(const curve448_point_t a,
|
||||
const curve448_point_t b);
|
||||
|
||||
/**
|
||||
* @brief Double a point. Equivalent to
|
||||
@ -222,10 +201,7 @@ __owur decaf_bool_t curve448_point_eq (
|
||||
* @param [out] two_a The sum a+a.
|
||||
* @param [in] a A point.
|
||||
*/
|
||||
void curve448_point_double (
|
||||
curve448_point_t two_a,
|
||||
const curve448_point_t a
|
||||
);
|
||||
void curve448_point_double(curve448_point_t two_a, const curve448_point_t a);
|
||||
|
||||
/**
|
||||
* @brief RFC 7748 Diffie-Hellman scalarmul. This function uses a different
|
||||
@ -239,11 +215,10 @@ void curve448_point_double (
|
||||
* @retval DECAF_FAILURE The scalarmul didn't succeed, because the base
|
||||
* point is in a small subgroup.
|
||||
*/
|
||||
__owur decaf_error_t decaf_x448 (
|
||||
uint8_t out[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t base[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
|
||||
);
|
||||
__owur decaf_error_t decaf_x448(uint8_t out[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t base[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Multiply a point by DECAF_X448_ENCODE_RATIO,
|
||||
@ -265,14 +240,14 @@ __owur decaf_error_t decaf_x448 (
|
||||
* @param [out] out The scaled and encoded point.
|
||||
* @param [in] p The point to be scaled and encoded.
|
||||
*/
|
||||
void curve448_point_mul_by_ratio_and_encode_like_x448 (
|
||||
uint8_t out[DECAF_X448_PUBLIC_BYTES],
|
||||
const curve448_point_t p
|
||||
);
|
||||
void curve448_point_mul_by_ratio_and_encode_like_x448(uint8_t
|
||||
out
|
||||
[DECAF_X448_PUBLIC_BYTES],
|
||||
const curve448_point_t p);
|
||||
|
||||
/** The base point for X448 Diffie-Hellman */
|
||||
extern const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES];
|
||||
|
||||
|
||||
/**
|
||||
* @brief RFC 7748 Diffie-Hellman base point scalarmul. This function uses
|
||||
* a different (non-Decaf) encoding.
|
||||
@ -283,11 +258,9 @@ extern const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES];
|
||||
* @param [out] scaled The scaled point base*scalar
|
||||
* @param [in] scalar The scalar to multiply by.
|
||||
*/
|
||||
void decaf_x448_derive_public_key (
|
||||
uint8_t out[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
|
||||
);
|
||||
|
||||
void decaf_x448_derive_public_key(uint8_t out[DECAF_X448_PUBLIC_BYTES],
|
||||
const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
|
||||
);
|
||||
|
||||
/**
|
||||
* @brief Multiply a precomputed base point by a scalar:
|
||||
@ -300,12 +273,9 @@ void decaf_x448_derive_public_key (
|
||||
* @param [in] base The point to be scaled.
|
||||
* @param [in] scalar The scalar to multiply by.
|
||||
*/
|
||||
void curve448_precomputed_scalarmul (
|
||||
curve448_point_t scaled,
|
||||
const curve448_precomputed_s *base,
|
||||
const curve448_scalar_t scalar
|
||||
);
|
||||
|
||||
void curve448_precomputed_scalarmul(curve448_point_t scaled,
|
||||
const curve448_precomputed_s * base,
|
||||
const curve448_scalar_t scalar);
|
||||
|
||||
/**
|
||||
* @brief Multiply two base points by two scalars:
|
||||
@ -322,12 +292,10 @@ void curve448_precomputed_scalarmul (
|
||||
* @warning: This function takes variable time, and may leak the scalars
|
||||
* used. It is designed for signature verification.
|
||||
*/
|
||||
void curve448_base_double_scalarmul_non_secret (
|
||||
curve448_point_t combo,
|
||||
const curve448_scalar_t scalar1,
|
||||
const curve448_point_t base2,
|
||||
const curve448_scalar_t scalar2
|
||||
);
|
||||
void curve448_base_double_scalarmul_non_secret(curve448_point_t combo,
|
||||
const curve448_scalar_t scalar1,
|
||||
const curve448_point_t base2,
|
||||
const curve448_scalar_t scalar2);
|
||||
|
||||
/**
|
||||
* @brief Test that a point is valid, for debugging purposes.
|
||||
@ -336,26 +304,20 @@ void curve448_base_double_scalarmul_non_secret (
|
||||
* @retval DECAF_TRUE The point is valid.
|
||||
* @retval DECAF_FALSE The point is invalid.
|
||||
*/
|
||||
__owur decaf_bool_t curve448_point_valid (
|
||||
const curve448_point_t to_test
|
||||
);
|
||||
__owur decaf_bool_t curve448_point_valid(const curve448_point_t to_test);
|
||||
|
||||
/**
|
||||
* @brief Overwrite scalar with zeros.
|
||||
*/
|
||||
void curve448_scalar_destroy (
|
||||
curve448_scalar_t scalar
|
||||
);
|
||||
void curve448_scalar_destroy(curve448_scalar_t scalar);
|
||||
|
||||
/**
|
||||
* @brief Overwrite point with zeros.
|
||||
*/
|
||||
void curve448_point_destroy (
|
||||
curve448_point_t point
|
||||
);
|
||||
void curve448_point_destroy(curve448_point_t point);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif /* __DECAF_POINT_448_H__ */
|
||||
#endif /* __DECAF_POINT_448_H__ */
|
||||
|
@ -15,110 +15,114 @@
|
||||
#include "constant_time.h"
|
||||
#include "point_448.h"
|
||||
|
||||
static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x3bd440fae918bc5;
|
||||
static const curve448_scalar_t sc_p = {{{
|
||||
SC_LIMB(0x2378c292ab5844f3), SC_LIMB(0x216cc2728dc58f55), SC_LIMB(0xc44edb49aed63690), SC_LIMB(0xffffffff7cca23e9), SC_LIMB(0xffffffffffffffff), SC_LIMB(0xffffffffffffffff), SC_LIMB(0x3fffffffffffffff)
|
||||
}}}, sc_r2 = {{{
|
||||
SC_LIMB(0xe3539257049b9b60), SC_LIMB(0x7af32c4bc1b195d9), SC_LIMB(0x0d66de2388ea1859), SC_LIMB(0xae17cf725ee4d838), SC_LIMB(0x1a9cc14ba3c47c44), SC_LIMB(0x2052bcb7e4d070af), SC_LIMB(0x3402a939f823b729)
|
||||
static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t) 0x3bd440fae918bc5;
|
||||
static const curve448_scalar_t sc_p = { {{
|
||||
SC_LIMB(0x2378c292ab5844f3),
|
||||
SC_LIMB(0x216cc2728dc58f55),
|
||||
SC_LIMB(0xc44edb49aed63690),
|
||||
SC_LIMB(0xffffffff7cca23e9),
|
||||
SC_LIMB(0xffffffffffffffff),
|
||||
SC_LIMB(0xffffffffffffffff),
|
||||
SC_LIMB(0x3fffffffffffffff)
|
||||
}}
|
||||
}, sc_r2 = { { {
|
||||
|
||||
SC_LIMB(0xe3539257049b9b60), SC_LIMB(0x7af32c4bc1b195d9),
|
||||
SC_LIMB(0x0d66de2388ea1859), SC_LIMB(0xae17cf725ee4d838),
|
||||
SC_LIMB(0x1a9cc14ba3c47c44), SC_LIMB(0x2052bcb7e4d070af),
|
||||
SC_LIMB(0x3402a939f823b729)
|
||||
}}};
|
||||
|
||||
/* End of template stuff */
|
||||
|
||||
#define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */
|
||||
#define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */
|
||||
|
||||
/* The scalar constants 1 and 0; limbs not listed are zero-initialised. */
const curve448_scalar_t curve448_scalar_one = { { { 1 } } };
const curve448_scalar_t curve448_scalar_zero = { { { 0 } } };
|
||||
|
||||
/** {extra,accum} - sub +? p
|
||||
* Must have extra <= 1
|
||||
*/
|
||||
static void sc_subx(
|
||||
curve448_scalar_t out,
|
||||
const decaf_word_t accum[DECAF_448_SCALAR_LIMBS],
|
||||
const curve448_scalar_t sub,
|
||||
const curve448_scalar_t p,
|
||||
decaf_word_t extra
|
||||
) {
|
||||
static void sc_subx(curve448_scalar_t out,
|
||||
const decaf_word_t accum[DECAF_448_SCALAR_LIMBS],
|
||||
const curve448_scalar_t sub,
|
||||
const curve448_scalar_t p, decaf_word_t extra)
|
||||
{
|
||||
decaf_dsword_t chain = 0;
|
||||
unsigned int i;
|
||||
decaf_word_t borrow;
|
||||
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
chain = (chain + accum[i]) - sub->limb[i];
|
||||
out->limb[i] = chain;
|
||||
chain >>= WBITS;
|
||||
}
|
||||
borrow = chain+extra; /* = 0 or -1 */
|
||||
|
||||
borrow = chain + extra; /* = 0 or -1 */
|
||||
|
||||
chain = 0;
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
chain = (chain + out->limb[i]) + (p->limb[i] & borrow);
|
||||
out->limb[i] = chain;
|
||||
chain >>= WBITS;
|
||||
}
|
||||
}
|
||||
|
||||
static void sc_montmul (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a,
|
||||
const curve448_scalar_t b
|
||||
) {
|
||||
unsigned int i,j;
|
||||
decaf_word_t accum[DECAF_448_SCALAR_LIMBS+1] = {0};
|
||||
static void sc_montmul(curve448_scalar_t out,
|
||||
const curve448_scalar_t a, const curve448_scalar_t b)
|
||||
{
|
||||
unsigned int i, j;
|
||||
decaf_word_t accum[DECAF_448_SCALAR_LIMBS + 1] = { 0 };
|
||||
decaf_word_t hi_carry = 0;
|
||||
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
decaf_word_t mand = a->limb[i];
|
||||
const decaf_word_t *mier = b->limb;
|
||||
|
||||
|
||||
decaf_dword_t chain = 0;
|
||||
for (j=0; j<DECAF_448_SCALAR_LIMBS; j++) {
|
||||
chain += ((decaf_dword_t)mand)*mier[j] + accum[j];
|
||||
for (j = 0; j < DECAF_448_SCALAR_LIMBS; j++) {
|
||||
chain += ((decaf_dword_t) mand) * mier[j] + accum[j];
|
||||
accum[j] = chain;
|
||||
chain >>= WBITS;
|
||||
}
|
||||
accum[j] = chain;
|
||||
|
||||
|
||||
mand = accum[0] * MONTGOMERY_FACTOR;
|
||||
chain = 0;
|
||||
mier = sc_p->limb;
|
||||
for (j=0; j<DECAF_448_SCALAR_LIMBS; j++) {
|
||||
chain += (decaf_dword_t)mand*mier[j] + accum[j];
|
||||
if (j) accum[j-1] = chain;
|
||||
for (j = 0; j < DECAF_448_SCALAR_LIMBS; j++) {
|
||||
chain += (decaf_dword_t) mand *mier[j] + accum[j];
|
||||
if (j)
|
||||
accum[j - 1] = chain;
|
||||
chain >>= WBITS;
|
||||
}
|
||||
chain += accum[j];
|
||||
chain += hi_carry;
|
||||
accum[j-1] = chain;
|
||||
accum[j - 1] = chain;
|
||||
hi_carry = chain >> WBITS;
|
||||
}
|
||||
|
||||
|
||||
sc_subx(out, accum, sc_p, sc_p, hi_carry);
|
||||
}
|
||||
|
||||
void curve448_scalar_mul (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a,
|
||||
const curve448_scalar_t b
|
||||
) {
|
||||
sc_montmul(out,a,b);
|
||||
sc_montmul(out,out,sc_r2);
|
||||
void curve448_scalar_mul(curve448_scalar_t out,
|
||||
const curve448_scalar_t a, const curve448_scalar_t b)
|
||||
{
|
||||
sc_montmul(out, a, b);
|
||||
sc_montmul(out, out, sc_r2);
|
||||
}
|
||||
|
||||
void curve448_scalar_sub (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a,
|
||||
const curve448_scalar_t b
|
||||
) {
|
||||
void curve448_scalar_sub(curve448_scalar_t out,
|
||||
const curve448_scalar_t a, const curve448_scalar_t b)
|
||||
{
|
||||
sc_subx(out, a->limb, b, sc_p, 0);
|
||||
}
|
||||
|
||||
void curve448_scalar_add (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a,
|
||||
const curve448_scalar_t b
|
||||
) {
|
||||
void curve448_scalar_add(curve448_scalar_t out,
|
||||
const curve448_scalar_t a, const curve448_scalar_t b)
|
||||
{
|
||||
decaf_dword_t chain = 0;
|
||||
unsigned int i;
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
chain = (chain + a->limb[i]) + b->limb[i];
|
||||
out->limb[i] = chain;
|
||||
chain >>= WBITS;
|
||||
@ -126,50 +130,47 @@ void curve448_scalar_add (
|
||||
sc_subx(out, out->limb, sc_p, sc_p, chain);
|
||||
}
|
||||
|
||||
static ossl_inline void scalar_decode_short (
|
||||
curve448_scalar_t s,
|
||||
const unsigned char *ser,
|
||||
unsigned int nbytes
|
||||
) {
|
||||
unsigned int i,j,k=0;
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
static ossl_inline void scalar_decode_short(curve448_scalar_t s,
|
||||
const unsigned char *ser,
|
||||
unsigned int nbytes)
|
||||
{
|
||||
unsigned int i, j, k = 0;
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
decaf_word_t out = 0;
|
||||
for (j=0; j<sizeof(decaf_word_t) && k<nbytes; j++,k++) {
|
||||
out |= ((decaf_word_t)ser[k])<<(8*j);
|
||||
for (j = 0; j < sizeof(decaf_word_t) && k < nbytes; j++, k++) {
|
||||
out |= ((decaf_word_t) ser[k]) << (8 * j);
|
||||
}
|
||||
s->limb[i] = out;
|
||||
}
|
||||
}
|
||||
|
||||
decaf_error_t curve448_scalar_decode(
|
||||
curve448_scalar_t s,
|
||||
const unsigned char ser[DECAF_448_SCALAR_BYTES]
|
||||
) {
|
||||
decaf_error_t curve448_scalar_decode(curve448_scalar_t s,
|
||||
const unsigned char
|
||||
ser[DECAF_448_SCALAR_BYTES]
|
||||
)
|
||||
{
|
||||
unsigned int i;
|
||||
decaf_dsword_t accum = 0;
|
||||
|
||||
scalar_decode_short(s, ser, DECAF_448_SCALAR_BYTES);
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
accum = (accum + s->limb[i] - sc_p->limb[i]) >> WBITS;
|
||||
}
|
||||
/* Here accum == 0 or -1 */
|
||||
|
||||
curve448_scalar_mul(s,s,curve448_scalar_one); /* ham-handed reduce */
|
||||
|
||||
|
||||
curve448_scalar_mul(s, s, curve448_scalar_one); /* ham-handed reduce */
|
||||
|
||||
return decaf_succeed_if(~word_is_zero(accum));
|
||||
}
|
||||
|
||||
void curve448_scalar_destroy (
|
||||
curve448_scalar_t scalar
|
||||
) {
|
||||
void curve448_scalar_destroy(curve448_scalar_t scalar)
|
||||
{
|
||||
OPENSSL_cleanse(scalar, sizeof(curve448_scalar_t));
|
||||
}
|
||||
|
||||
void curve448_scalar_decode_long(
|
||||
curve448_scalar_t s,
|
||||
const unsigned char *ser,
|
||||
size_t ser_len
|
||||
) {
|
||||
void curve448_scalar_decode_long(curve448_scalar_t s,
|
||||
const unsigned char *ser, size_t ser_len)
|
||||
{
|
||||
size_t i;
|
||||
curve448_scalar_t t1, t2;
|
||||
|
||||
@ -178,23 +179,24 @@ void curve448_scalar_decode_long(
|
||||
return;
|
||||
}
|
||||
|
||||
i = ser_len - (ser_len%DECAF_448_SCALAR_BYTES);
|
||||
if (i==ser_len) i -= DECAF_448_SCALAR_BYTES;
|
||||
|
||||
scalar_decode_short(t1, &ser[i], ser_len-i);
|
||||
i = ser_len - (ser_len % DECAF_448_SCALAR_BYTES);
|
||||
if (i == ser_len)
|
||||
i -= DECAF_448_SCALAR_BYTES;
|
||||
|
||||
scalar_decode_short(t1, &ser[i], ser_len - i);
|
||||
|
||||
if (ser_len == sizeof(curve448_scalar_t)) {
|
||||
assert(i==0);
|
||||
assert(i == 0);
|
||||
/* ham-handed reduce */
|
||||
curve448_scalar_mul(s,t1,curve448_scalar_one);
|
||||
curve448_scalar_mul(s, t1, curve448_scalar_one);
|
||||
curve448_scalar_destroy(t1);
|
||||
return;
|
||||
}
|
||||
|
||||
while (i) {
|
||||
i -= DECAF_448_SCALAR_BYTES;
|
||||
sc_montmul(t1,t1,sc_r2);
|
||||
ignore_result( curve448_scalar_decode(t2, ser+i) );
|
||||
sc_montmul(t1, t1, sc_r2);
|
||||
ignore_result(curve448_scalar_decode(t2, ser + i));
|
||||
curve448_scalar_add(t1, t1, t2);
|
||||
}
|
||||
|
||||
@ -203,33 +205,29 @@ void curve448_scalar_decode_long(
|
||||
curve448_scalar_destroy(t2);
|
||||
}
|
||||
|
||||
void curve448_scalar_encode(
|
||||
unsigned char ser[DECAF_448_SCALAR_BYTES],
|
||||
const curve448_scalar_t s
|
||||
) {
|
||||
unsigned int i,j,k=0;
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
for (j=0; j<sizeof(decaf_word_t); j++,k++) {
|
||||
ser[k] = s->limb[i] >> (8*j);
|
||||
void curve448_scalar_encode(unsigned char ser[DECAF_448_SCALAR_BYTES],
|
||||
const curve448_scalar_t s)
|
||||
{
|
||||
unsigned int i, j, k = 0;
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
for (j = 0; j < sizeof(decaf_word_t); j++, k++) {
|
||||
ser[k] = s->limb[i] >> (8 * j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void curve448_scalar_halve (
|
||||
curve448_scalar_t out,
|
||||
const curve448_scalar_t a
|
||||
) {
|
||||
void curve448_scalar_halve(curve448_scalar_t out, const curve448_scalar_t a)
|
||||
{
|
||||
decaf_word_t mask = -(a->limb[0] & 1);
|
||||
decaf_dword_t chain = 0;
|
||||
unsigned int i;
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
|
||||
chain = (chain + a->limb[i]) + (sc_p->limb[i] & mask);
|
||||
out->limb[i] = chain;
|
||||
chain >>= DECAF_WORD_BITS;
|
||||
}
|
||||
for (i=0; i<DECAF_448_SCALAR_LIMBS-1; i++) {
|
||||
out->limb[i] = out->limb[i]>>1 | out->limb[i+1]<<(WBITS-1);
|
||||
for (i = 0; i < DECAF_448_SCALAR_LIMBS - 1; i++) {
|
||||
out->limb[i] = out->limb[i] >> 1 | out->limb[i + 1] << (WBITS - 1);
|
||||
}
|
||||
out->limb[i] = out->limb[i]>>1 | chain<<(WBITS-1);
|
||||
out->limb[i] = out->limb[i] >> 1 | chain << (WBITS - 1);
|
||||
}
|
||||
|
||||
|
@ -11,208 +11,212 @@
|
||||
*/
|
||||
|
||||
#ifndef __WORD_H__
|
||||
#define __WORD_H__
|
||||
# define __WORD_H__
|
||||
|
||||
#include <string.h>
|
||||
# include <string.h>
|
||||
|
||||
#include <assert.h>
|
||||
#include <openssl/e_os2.h>
|
||||
#include "arch_intrinsics.h"
|
||||
# include <assert.h>
|
||||
# include <openssl/e_os2.h>
|
||||
# include "arch_intrinsics.h"
|
||||
|
||||
#include "curve448utils.h"
|
||||
# include "curve448utils.h"
|
||||
|
||||
#ifndef _BSD_SOURCE
|
||||
#define _BSD_SOURCE 1
|
||||
#endif
|
||||
# ifndef _BSD_SOURCE
|
||||
# define _BSD_SOURCE 1
|
||||
# endif
|
||||
|
||||
#ifndef _DEFAULT_SOURCE
|
||||
#define _DEFAULT_SOURCE 1
|
||||
#endif
|
||||
# ifndef _DEFAULT_SOURCE
|
||||
# define _DEFAULT_SOURCE 1
|
||||
# endif
|
||||
|
||||
#include <stdlib.h>
|
||||
# include <stdlib.h>
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
#include <arm_neon.h>
|
||||
#elif defined(__SSE2__)
|
||||
#if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
|
||||
#include <immintrin.h>
|
||||
#else
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
# if defined(__ARM_NEON__)
|
||||
# include <arm_neon.h>
|
||||
# elif defined(__SSE2__)
|
||||
# if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
|
||||
# include <immintrin.h>
|
||||
# else
|
||||
# include <emmintrin.h>
|
||||
# endif
|
||||
# endif
|
||||
|
||||
#if (ARCH_WORD_BITS == 64)
|
||||
typedef uint64_t word_t, mask_t;
|
||||
typedef __uint128_t dword_t;
|
||||
typedef int32_t hsword_t;
|
||||
typedef int64_t sword_t;
|
||||
typedef __int128_t dsword_t;
|
||||
#elif (ARCH_WORD_BITS == 32)
|
||||
typedef uint32_t word_t, mask_t;
|
||||
typedef uint64_t dword_t;
|
||||
typedef int16_t hsword_t;
|
||||
typedef int32_t sword_t;
|
||||
typedef int64_t dsword_t;
|
||||
#else
|
||||
#error "For now, libdecaf only supports 32- and 64-bit architectures."
|
||||
#endif
|
||||
|
||||
/* Scalar limbs are keyed off of the API word size instead of the arch word size. */
|
||||
#if DECAF_WORD_BITS == 64
|
||||
#define SC_LIMB(x) (x)
|
||||
#elif DECAF_WORD_BITS == 32
|
||||
#define SC_LIMB(x) ((uint32_t)x),(x>>32)
|
||||
#else
|
||||
#error "For now, libdecaf only supports 32- and 64-bit architectures."
|
||||
#endif
|
||||
# if (ARCH_WORD_BITS == 64)
|
||||
typedef uint64_t word_t, mask_t;
|
||||
typedef __uint128_t dword_t;
|
||||
typedef int32_t hsword_t;
|
||||
typedef int64_t sword_t;
|
||||
typedef __int128_t dsword_t;
|
||||
# elif (ARCH_WORD_BITS == 32)
|
||||
typedef uint32_t word_t, mask_t;
|
||||
typedef uint64_t dword_t;
|
||||
typedef int16_t hsword_t;
|
||||
typedef int32_t sword_t;
|
||||
typedef int64_t dsword_t;
|
||||
# else
|
||||
# error "For now, libdecaf only supports 32- and 64-bit architectures."
|
||||
# endif
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
typedef uint32x4_t vecmask_t;
|
||||
#elif defined(__clang__)
|
||||
typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
|
||||
typedef int64_t int64x2_t __attribute__((ext_vector_type(2)));
|
||||
typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef int64_t int64x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
|
||||
typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
|
||||
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
|
||||
typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
|
||||
typedef int32_t int32x8_t __attribute__((ext_vector_type(8)));
|
||||
typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
|
||||
#else /* GCC, hopefully? */
|
||||
typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
|
||||
typedef int64_t int64x2_t __attribute__((vector_size(16)));
|
||||
typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
|
||||
typedef int64_t int64x4_t __attribute__((vector_size(32)));
|
||||
typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
|
||||
typedef int32_t int32x4_t __attribute__((vector_size(16)));
|
||||
typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
|
||||
typedef int32_t int32x2_t __attribute__((vector_size(8)));
|
||||
typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
|
||||
typedef int32_t int32x8_t __attribute__((vector_size(32)));
|
||||
typedef word_t vecmask_t __attribute__((vector_size(32)));
|
||||
#endif
|
||||
/*
|
||||
* Scalar limbs are keyed off of the API word size instead of the arch word
|
||||
* size.
|
||||
*/
|
||||
# if DECAF_WORD_BITS == 64
|
||||
# define SC_LIMB(x) (x)
|
||||
# elif DECAF_WORD_BITS == 32
|
||||
# define SC_LIMB(x) ((uint32_t)x),(x>>32)
|
||||
# else
|
||||
# error "For now, libdecaf only supports 32- and 64-bit architectures."
|
||||
# endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
#define VECTOR_ALIGNED __attribute__((aligned(32)))
|
||||
typedef uint32x8_t big_register_t;
|
||||
typedef uint64x4_t uint64xn_t;
|
||||
typedef uint32x8_t uint32xn_t;
|
||||
# ifdef __ARM_NEON__
|
||||
typedef uint32x4_t vecmask_t;
|
||||
# elif defined(__clang__)
|
||||
typedef uint64_t uint64x2_t __attribute__ ((ext_vector_type(2)));
|
||||
typedef int64_t int64x2_t __attribute__ ((ext_vector_type(2)));
|
||||
typedef uint64_t uint64x4_t __attribute__ ((ext_vector_type(4)));
|
||||
typedef int64_t int64x4_t __attribute__ ((ext_vector_type(4)));
|
||||
typedef uint32_t uint32x4_t __attribute__ ((ext_vector_type(4)));
|
||||
typedef int32_t int32x4_t __attribute__ ((ext_vector_type(4)));
|
||||
typedef uint32_t uint32x2_t __attribute__ ((ext_vector_type(2)));
|
||||
typedef int32_t int32x2_t __attribute__ ((ext_vector_type(2)));
|
||||
typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
|
||||
typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
|
||||
typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
|
||||
# else /* GCC, hopefully? */
|
||||
typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
|
||||
typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
|
||||
typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
|
||||
typedef int64_t int64x4_t __attribute__ ((vector_size(32)));
|
||||
typedef uint32_t uint32x4_t __attribute__ ((vector_size(16)));
|
||||
typedef int32_t int32x4_t __attribute__ ((vector_size(16)));
|
||||
typedef uint32_t uint32x2_t __attribute__ ((vector_size(8)));
|
||||
typedef int32_t int32x2_t __attribute__ ((vector_size(8)));
|
||||
typedef uint32_t uint32x8_t __attribute__ ((vector_size(32)));
|
||||
typedef int32_t int32x8_t __attribute__ ((vector_size(32)));
|
||||
typedef word_t vecmask_t __attribute__ ((vector_size(32)));
|
||||
# endif
|
||||
|
||||
static ossl_inline big_register_t
|
||||
br_set_to_mask(mask_t x) {
|
||||
uint32_t y = (uint32_t)x;
|
||||
big_register_t ret = {y,y,y,y,y,y,y,y};
|
||||
return ret;
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
#define VECTOR_ALIGNED __attribute__((aligned(16)))
|
||||
typedef uint32x4_t big_register_t;
|
||||
typedef uint64x2_t uint64xn_t;
|
||||
typedef uint32x4_t uint32xn_t;
|
||||
# if defined(__AVX2__)
|
||||
# define VECTOR_ALIGNED __attribute__((aligned(32)))
|
||||
typedef uint32x8_t big_register_t;
|
||||
typedef uint64x4_t uint64xn_t;
|
||||
typedef uint32x8_t uint32xn_t;
|
||||
|
||||
static ossl_inline big_register_t
|
||||
br_set_to_mask(mask_t x) {
|
||||
uint32_t y = x;
|
||||
big_register_t ret = {y,y,y,y};
|
||||
return ret;
|
||||
}
|
||||
#elif defined(__ARM_NEON__)
|
||||
#define VECTOR_ALIGNED __attribute__((aligned(16)))
|
||||
typedef uint32x4_t big_register_t;
|
||||
typedef uint64x2_t uint64xn_t;
|
||||
typedef uint32x4_t uint32xn_t;
|
||||
|
||||
static ossl_inline big_register_t
|
||||
br_set_to_mask(mask_t x) {
|
||||
return vdupq_n_u32(x);
|
||||
}
|
||||
#elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
|
||||
static ossl_inline big_register_t br_set_to_mask(mask_t x)
|
||||
{
|
||||
uint32_t y = (uint32_t)x;
|
||||
big_register_t ret = { y, y, y, y, y, y, y, y };
|
||||
return ret;
|
||||
}
|
||||
# elif defined(__SSE2__)
|
||||
# define VECTOR_ALIGNED __attribute__((aligned(16)))
|
||||
typedef uint32x4_t big_register_t;
|
||||
typedef uint64x2_t uint64xn_t;
|
||||
typedef uint32x4_t uint32xn_t;
|
||||
|
||||
static ossl_inline big_register_t br_set_to_mask(mask_t x)
|
||||
{
|
||||
uint32_t y = x;
|
||||
big_register_t ret = { y, y, y, y };
|
||||
return ret;
|
||||
}
|
||||
# elif defined(__ARM_NEON__)
|
||||
# define VECTOR_ALIGNED __attribute__((aligned(16)))
|
||||
typedef uint32x4_t big_register_t;
|
||||
typedef uint64x2_t uint64xn_t;
|
||||
typedef uint32x4_t uint32xn_t;
|
||||
|
||||
static ossl_inline big_register_t br_set_to_mask(mask_t x)
|
||||
{
|
||||
return vdupq_n_u32(x);
|
||||
}
|
||||
# elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
|
||||
|| defined(__aarch64__)
|
||||
#define VECTOR_ALIGNED __attribute__((aligned(8)))
|
||||
typedef uint64_t big_register_t, uint64xn_t;
|
||||
# define VECTOR_ALIGNED __attribute__((aligned(8)))
|
||||
typedef uint64_t big_register_t, uint64xn_t;
|
||||
|
||||
typedef uint32_t uint32xn_t;
|
||||
static ossl_inline big_register_t
|
||||
br_set_to_mask(mask_t x) {
|
||||
return (big_register_t)x;
|
||||
}
|
||||
#else
|
||||
#define VECTOR_ALIGNED __attribute__((aligned(4)))
|
||||
typedef uint64_t uint64xn_t;
|
||||
typedef uint32_t uint32xn_t;
|
||||
typedef uint32_t big_register_t;
|
||||
typedef uint32_t uint32xn_t;
|
||||
static ossl_inline big_register_t br_set_to_mask(mask_t x)
|
||||
{
|
||||
return (big_register_t) x;
|
||||
}
|
||||
# else
|
||||
# define VECTOR_ALIGNED __attribute__((aligned(4)))
|
||||
typedef uint64_t uint64xn_t;
|
||||
typedef uint32_t uint32xn_t;
|
||||
typedef uint32_t big_register_t;
|
||||
|
||||
static ossl_inline big_register_t
|
||||
br_set_to_mask(mask_t x) {
|
||||
return (big_register_t)x;
|
||||
}
|
||||
#endif
|
||||
static ossl_inline big_register_t br_set_to_mask(mask_t x)
|
||||
{
|
||||
return (big_register_t) x;
|
||||
}
|
||||
# endif
|
||||
|
||||
#if defined(__AVX2__)
|
||||
static ossl_inline big_register_t
|
||||
br_is_zero(big_register_t x) {
|
||||
return (big_register_t)(x == br_set_to_mask(0));
|
||||
}
|
||||
#elif defined(__SSE2__)
|
||||
static ossl_inline big_register_t
|
||||
br_is_zero(big_register_t x) {
|
||||
return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
|
||||
//return (big_register_t)(x == br_set_to_mask(0));
|
||||
}
|
||||
#elif defined(__ARM_NEON__)
|
||||
static ossl_inline big_register_t
|
||||
br_is_zero(big_register_t x) {
|
||||
return vceqq_u32(x,x^x);
|
||||
}
|
||||
#else
|
||||
#define br_is_zero word_is_zero
|
||||
#endif
|
||||
# if defined(__AVX2__)
|
||||
static ossl_inline big_register_t br_is_zero(big_register_t x)
|
||||
{
|
||||
return (big_register_t) (x == br_set_to_mask(0));
|
||||
}
|
||||
# elif defined(__SSE2__)
|
||||
static ossl_inline big_register_t br_is_zero(big_register_t x)
|
||||
{
|
||||
return (big_register_t) _mm_cmpeq_epi32((__m128i) x, _mm_setzero_si128());
|
||||
// return (big_register_t)(x == br_set_to_mask(0));
|
||||
}
|
||||
# elif defined(__ARM_NEON__)
|
||||
static ossl_inline big_register_t br_is_zero(big_register_t x)
|
||||
{
|
||||
return vceqq_u32(x, x ^ x);
|
||||
}
|
||||
# else
|
||||
# define br_is_zero word_is_zero
|
||||
# endif
|
||||
|
||||
/* PERF: vectorize vs unroll */
|
||||
#ifdef __clang__
|
||||
#if 100*__clang_major__ + __clang_minor__ > 305
|
||||
#define UNROLL _Pragma("clang loop unroll(full)")
|
||||
#endif
|
||||
#endif
|
||||
# ifdef __clang__
|
||||
# if 100*__clang_major__ + __clang_minor__ > 305
|
||||
# define UNROLL _Pragma("clang loop unroll(full)")
|
||||
# endif
|
||||
# endif
|
||||
|
||||
#ifndef UNROLL
|
||||
#define UNROLL
|
||||
#endif
|
||||
# ifndef UNROLL
|
||||
# define UNROLL
|
||||
# endif
|
||||
|
||||
/* The plan on booleans:
|
||||
*
|
||||
* The external interface uses decaf_bool_t, but this might be a different
|
||||
* size than our particular arch's word_t (and thus mask_t). Also, the caller
|
||||
* isn't guaranteed to pass it as nonzero. So bool_to_mask converts word sizes
|
||||
* and checks nonzero.
|
||||
*
|
||||
* On the flip side, mask_t is always -1 or 0, but it might be a different size
|
||||
* than decaf_bool_t.
|
||||
*
|
||||
* On the third hand, we have success vs boolean types, but that's handled in
|
||||
* common.h: it converts between decaf_bool_t and decaf_error_t.
|
||||
/*
|
||||
* The plan on booleans: The external interface uses decaf_bool_t, but this
|
||||
* might be a different size than our particular arch's word_t (and thus
|
||||
* mask_t). Also, the caller isn't guaranteed to pass it as nonzero. So
|
||||
* bool_to_mask converts word sizes and checks nonzero. On the flip side,
|
||||
* mask_t is always -1 or 0, but it might be a different size than
|
||||
* decaf_bool_t. On the third hand, we have success vs boolean types, but
|
||||
* that's handled in common.h: it converts between decaf_bool_t and
|
||||
* decaf_error_t.
|
||||
*/
|
||||
static ossl_inline decaf_bool_t mask_to_bool (mask_t m) {
|
||||
return (decaf_sword_t)(sword_t)m;
|
||||
static ossl_inline decaf_bool_t mask_to_bool(mask_t m)
|
||||
{
|
||||
return (decaf_sword_t) (sword_t) m;
|
||||
}
|
||||
|
||||
/*
 * Convert a decaf_bool_t into a mask_t: the result has all bits set when
 * any word-sized chunk of m is nonzero, and is zero otherwise.
 */
static ossl_inline mask_t bool_to_mask(decaf_bool_t m)
{
    /* On most arches this will be optimized to a simple cast. */
    unsigned int chunks = sizeof(decaf_bool_t) / sizeof(mask_t);
    unsigned int k;
    mask_t result = 0;

    /* A decaf_bool_t narrower than mask_t still needs one pass. */
    if (chunks < 1)
        chunks = 1;

    for (k = 0; k < chunks; k++)
        result |= ~word_is_zero(m >> (k * 8 * sizeof(word_t)));

    return result;
}
|
||||
|
||||
/* Deliberately discard a result that the caller has chosen not to check. */
static ossl_inline void ignore_result(decaf_bool_t boo)
{
    (void)boo;
}
|
||||
|
||||
#endif /* __WORD_H__ */
|
||||
#endif /* __WORD_H__ */
|
||||
|
Loading…
Reference in New Issue
Block a user