Run util/openssl-format-source on the Curve448 code

Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de> (Merged from https://github.com/openssl/openssl/pull/5105)
2025-01-30 14:01:55 +08:00 · 2017-12-04 11:38:58 +00:00 · 2017-12-04 11:38:58 +00:00 · 205fd63881
commit 205fd63881
parent 1308e022e1
29 changed files with 4140 additions and 3030 deletions
--- a/crypto/ec/curve448/arch_32/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_32/arch_intrinsics.h
@@ -11,20 +11,21 @@
 */

 #ifndef __ARCH_ARCH_32_ARCH_INTRINSICS_H__
-#define __ARCH_ARCH_32_ARCH_INTRINSICS_H__
+# define __ARCH_ARCH_32_ARCH_INTRINSICS_H__

-#define ARCH_WORD_BITS 32
+# define ARCH_WORD_BITS 32

-static __inline__ __attribute((always_inline,unused))
-uint32_t word_is_zero(uint32_t a) {
+static __inline__ __attribute((always_inline, unused))
+uint32_t word_is_zero(uint32_t a)
+{
    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((uint64_t)a)-1)>>32;
+    return (((uint64_t)a) - 1) >> 32;
 }

-static __inline__ __attribute((always_inline,unused))
-uint64_t widemul(uint32_t a, uint32_t b) {
+static __inline__ __attribute((always_inline, unused))
+uint64_t widemul(uint32_t a, uint32_t b)
+{
    return ((uint64_t)a) * b;
 }

-#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
-
+#endif                          /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
--- a/crypto/ec/curve448/arch_32/f_impl.c
+++ b/crypto/ec/curve448/arch_32/f_impl.c
@@ -14,84 +14,80 @@

 #if (defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) && !I_HATE_UNROLLED_LOOPS) \
     || defined(DECAF_FORCE_UNROLL)
-#define REPEAT8(_x) _x _x _x _x _x _x _x _x
-#define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0)
+# define REPEAT8(_x) _x _x _x _x _x _x _x _x
+# define FOR_LIMB(_i,_start,_end,_x) do { _i=_start; REPEAT8( if (_i<_end) { _x; } _i++;) } while (0)
 #else
-#define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
+# define FOR_LIMB(_i,_start,_end,_x) do { for (_i=_start; _i<_end; _i++) _x; } while (0)
 #endif

-void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { 
+void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+{
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0 = 0, accum1 = 0, accum2 = 0;
-    uint32_t mask = (1<<28) - 1;  
+    uint32_t mask = (1 << 28) - 1;

    uint32_t aa[8], bb[8];
-    
-    int i,j;
-    for (i=0; i<8; i++) {
-        aa[i] = a[i] + a[i+8];
-        bb[i] = b[i] + b[i+8];
+
+    int i, j;
+    for (i = 0; i < 8; i++) {
+        aa[i] = a[i] + a[i + 8];
+        bb[i] = b[i] + b[i + 8];
    }
-    
-    FOR_LIMB(j,0,8,{
-        accum2 = 0;
-    
-        FOR_LIMB (i,0,j+1,{
-            accum2 += widemul(a[j-i],b[i]);
-            accum1 += widemul(aa[j-i],bb[i]);
-            accum0 += widemul(a[8+j-i], b[8+i]);
-        });
-        
-        accum1 -= accum2;
-        accum0 += accum2;
-        accum2 = 0;
-    
-        FOR_LIMB (i,j+1,8,{
-            accum0 -= widemul(a[8+j-i], b[i]);
-            accum2 += widemul(aa[8+j-i], bb[i]);
-            accum1 += widemul(a[16+j-i], b[8+i]);
-        });

-        accum1 += accum2;
-        accum0 += accum2;
+    FOR_LIMB(j, 0, 8, {
+             accum2 = 0;
+             FOR_LIMB(i, 0, j + 1, {
+                      accum2 += widemul(a[j - i], b[i]);
+                      accum1 += widemul(aa[j - i], bb[i]);
+                      accum0 += widemul(a[8 + j - i], b[8 + i]);
+                      }
+             ); accum1 -= accum2; accum0 += accum2;
+             accum2 = 0;
+             FOR_LIMB(i, j + 1, 8, {
+                      accum0 -=
+                      widemul(a[8 + j - i], b[i]);
+                      accum2 +=
+                      widemul(aa[8 + j - i],
+                              bb[i]);
+                      accum1 += widemul(a[16 + j - i], b[8 + i]);
+                      }
+             );
+             accum1 += accum2;
+             accum0 += accum2;
+             c[j] = ((uint32_t)(accum0)) & mask;
+             c[j + 8] = ((uint32_t)(accum1)) & mask;
+             accum0 >>= 28; accum1 >>= 28;
+             });

-        c[j] = ((uint32_t)(accum0)) & mask;
-        c[j+8] = ((uint32_t)(accum1)) & mask;
-
-        accum0 >>= 28;
-        accum1 >>= 28;
-    });
-    
    accum0 += accum1;
    accum0 += c[8];
    accum1 += c[0];
    c[8] = ((uint32_t)(accum0)) & mask;
    c[0] = ((uint32_t)(accum1)) & mask;
-    
+
    accum0 >>= 28;
    accum1 >>= 28;
    c[9] += ((uint32_t)(accum0));
    c[1] += ((uint32_t)(accum1));
 }

-void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
+{
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;
    uint64_t accum0 = 0, accum8 = 0;
-    uint32_t mask = (1<<28)-1;  
+    uint32_t mask = (1 << 28) - 1;
    int i;

-    assert(b<1<<28);
+    assert(b < 1 << 28);

-    FOR_LIMB(i,0,8,{
-        accum0 += widemul(b, a[i]);
-        accum8 += widemul(b, a[i+8]);
-
-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
-    });
+    FOR_LIMB(i, 0, 8, {
+             accum0 += widemul(b, a[i]); accum8 += widemul(b, a[i + 8]);
+             c[i] = accum0 & mask; accum0 >>= 28;
+             c[i + 8] = accum8 & mask; accum8 >>= 28;
+             });

    accum0 += accum8 + c[8];
    c[8] = accum0 & mask;
@@ -102,7 +98,7 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    c[1] += accum8 >> 28;
 }

-void gf_sqr (gf_s *__restrict__ cs, const gf as) {
-    gf_mul(cs,as,as); /* Performs better with a dedicated square */
+void gf_sqr(gf_s * __restrict__ cs, const gf as)
+{
+    gf_mul(cs, as, as);         /* Performs better with a dedicated square */
 }
-
--- a/crypto/ec/curve448/arch_32/f_impl.h
+++ b/crypto/ec/curve448/arch_32/f_impl.h
@@ -13,43 +13,46 @@
 #define LIMB(x) (x)&((1<<28)-1), (x)>>28
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
-    
+
 #define LIMB_PLACE_VALUE(i) 28

-void gf_add_RAW (gf out, const gf a, const gf b) {
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
    unsigned int i;

-    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+    for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
 }

-void gf_sub_RAW (gf out, const gf a, const gf b) {
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
    unsigned int i;

-    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+    for (i = 0; i < sizeof(*out) / sizeof(out->limb[0]); i++) {
        out->limb[i] = a->limb[i] - b->limb[i];
    }
 }

-void gf_bias (gf a, int amt) {
+void gf_bias(gf a, int amt)
+{
    unsigned int i;
-    uint32_t co1 = ((1<<28)-1)*amt, co2 = co1-amt;
+    uint32_t co1 = ((1 << 28) - 1) * amt, co2 = co1 - amt;

-    for (i=0; i<sizeof(*a)/sizeof(a->limb[0]); i++) {
-        a->limb[i] += (i==sizeof(*a)/sizeof(a->limb[0])/2) ? co2 : co1;
+    for (i = 0; i < sizeof(*a) / sizeof(a->limb[0]); i++) {
+        a->limb[i] += (i == sizeof(*a) / sizeof(a->limb[0]) / 2) ? co2 : co1;
    }
 }

-void gf_weak_reduce (gf a) {
-    uint32_t mask = (1<<28) - 1;
+void gf_weak_reduce(gf a)
+{
+    uint32_t mask = (1 << 28) - 1;
    uint32_t tmp = a->limb[15] >> 28;
    unsigned int i;

    a->limb[8] += tmp;
-    for (i=15; i>0; i--) {
-        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
+    for (i = 15; i > 0; i--) {
+        a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }
-
--- a/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_arm_32/arch_intrinsics.h
@@ -11,22 +11,26 @@
 */

 #ifndef __ARCH_ARM_32_ARCH_INTRINSICS_H__
-#define __ARCH_ARM_32_ARCH_INTRINSICS_H__
+# define __ARCH_ARM_32_ARCH_INTRINSICS_H__

-#define ARCH_WORD_BITS 32
+# define ARCH_WORD_BITS 32

-static __inline__ __attribute((always_inline,unused))
-uint32_t word_is_zero(uint32_t a) {
+static __inline__ __attribute((always_inline, unused))
+uint32_t word_is_zero(uint32_t a)
+{
    uint32_t ret;
-    asm("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
+ asm("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
    return ret;
 }

-static __inline__ __attribute((always_inline,unused))
-uint64_t widemul(uint32_t a, uint32_t b) {
-    /* Could be UMULL, but it's hard to express to CC that the registers must be different */
-    return ((uint64_t)a) * b; 
+static __inline__ __attribute((always_inline, unused))
+uint64_t widemul(uint32_t a, uint32_t b)
+{
+    /*
+     * Could be UMULL, but it's hard to express to CC that the registers must
+     * be different
+     */
+    return ((uint64_t)a) * b;
 }

-#endif /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
-
+#endif                          /* __ARCH_ARM_32_ARCH_INTRINSICS_H__ */
--- a/crypto/ec/curve448/arch_arm_32/f_impl.c
+++ b/crypto/ec/curve448/arch_arm_32/f_impl.c
@@ -12,100 +12,89 @@

 #include "f_field.h"

-static inline void __attribute__((gnu_inline,always_inline))
-smlal (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+    smlal(uint64_t *acc, const uint32_t a, const uint32_t b)
+{

 #ifdef  __ARMEL__
-    uint32_t lo = *acc, hi = (*acc)>>32;
-    
-    __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
-        : [lo]"+&r"(lo), [hi]"+&r"(hi)
-        : [a]"r"(a), [b]"r"(b));
-    
-    *acc = lo + (((uint64_t)hi)<<32);
+    uint32_t lo = *acc, hi = (*acc) >> 32;
+
+    __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
+                         [hi] "+&r"(hi)
+                         :[a] "r"(a),[b] "r"(b));
+
+    *acc = lo + (((uint64_t)hi) << 32);
 #else
-    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)b;
+    *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)b;
 #endif
 }

-static inline void __attribute__((gnu_inline,always_inline))
-smlal2 (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+    smlal2(uint64_t *acc, const uint32_t a, const uint32_t b)
+{
 #ifdef __ARMEL__
-    uint32_t lo = *acc, hi = (*acc)>>32;
-    
-    __asm__ __volatile__ ("smlal %[lo], %[hi], %[a], %[b]"
-        : [lo]"+&r"(lo), [hi]"+&r"(hi)
-        : [a]"r"(a), [b]"r"(2*b));
-    
-    *acc = lo + (((uint64_t)hi)<<32);
+    uint32_t lo = *acc, hi = (*acc) >> 32;
+
+    __asm__ __volatile__("smlal %[lo], %[hi], %[a], %[b]":[lo] "+&r"(lo),
+                         [hi] "+&r"(hi)
+                         :[a] "r"(a),[b] "r"(2 * b));
+
+    *acc = lo + (((uint64_t)hi) << 32);
 #else
-    *acc += (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
+    *acc += (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
 #endif
 }

-static inline void __attribute__((gnu_inline,always_inline))
-smull (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+    smull(uint64_t *acc, const uint32_t a, const uint32_t b)
+{
 #ifdef __ARMEL__
    uint32_t lo, hi;
-    
-    __asm__ __volatile__ ("smull %[lo], %[hi], %[a], %[b]"
-        : [lo]"=&r"(lo), [hi]"=&r"(hi)
-        : [a]"r"(a), [b]"r"(b));
-    
-    *acc = lo + (((uint64_t)hi)<<32);
+
+    __asm__ __volatile__("smull %[lo], %[hi], %[a], %[b]":[lo] "=&r"(lo),
+                         [hi] "=&r"(hi)
+                         :[a] "r"(a),[b] "r"(b));
+
+    *acc = lo + (((uint64_t)hi) << 32);
 #else
-    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b;
+    *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)b;
 #endif
 }

-static inline void __attribute__((gnu_inline,always_inline))
-smull2 (
-    uint64_t *acc,
-    const uint32_t a,
-    const uint32_t b
-) {
+static inline void __attribute__ ((gnu_inline, always_inline))
+    smull2(uint64_t *acc, const uint32_t a, const uint32_t b)
+{
 #ifdef __ARMEL__
    uint32_t lo, hi;
-    
+
    __asm__ /*__volatile__*/ ("smull %[lo], %[hi], %[a], %[b]"
-        : [lo]"=&r"(lo), [hi]"=&r"(hi)
-        : [a]"r"(a), [b]"r"(2*b));
-    
-    *acc = lo + (((uint64_t)hi)<<32);
+ :                           [lo] "=&r"(lo),[hi] "=&r"(hi)
+ :                           [a] "r"(a),[b] "r"(2 * b));
+
+    *acc = lo + (((uint64_t)hi) << 32);
 #else
-    *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)(b * 2);
+    *acc = (int64_t)(int32_t)a *(int64_t)(int32_t)(b * 2);
 #endif
 }

-void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
-    
+void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+{
+
    const uint32_t *a = as->limb, *b = bs->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1;
-    uint32_t mask = (1<<28) - 1;  
+    uint32_t mask = (1 << 28) - 1;

    uint32_t aa[8], bm[8];

    int i;
-    for (i=0; i<8; i++) {
-        aa[i] = a[i] + a[i+8];
-        bm[i] = b[i] - b[i+8];
+    for (i = 0; i < 8; i++) {
+        aa[i] = a[i] + a[i + 8];
+        bm[i] = b[i] - b[i + 8];
    }

-    uint32_t ax,bx;
+    uint32_t ax, bx;
    {
        /* t^3 terms */
        smull(&accum1, ax = aa[1], bx = b[15]);
@@ -121,15 +110,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum1, ax, bx = b[10]);
        smlal(&accum3, ax = aa[7], bx);
        smlal(&accum1, ax, bx = b[9]);
-        
+
        accum0 = accum1;
        accum2 = accum3;
-        
+
        /* t^2 terms */
        smlal(&accum2, ax = aa[0], bx);
        smlal(&accum0, ax, bx = b[8]);
        smlal(&accum2, ax = aa[1], bx);
-        
+
        smlal(&accum0, ax = a[9], bx = b[7]);
        smlal(&accum2, ax = a[10], bx);
        smlal(&accum0, ax, bx = b[6]);
@@ -143,14 +132,14 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum0, ax, bx = b[2]);
        smlal(&accum2, ax = a[15], bx);
        smlal(&accum0, ax, bx = b[1]);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 += accum2;
        smlal(&accum3, ax = a[8], bx);
        smlal(&accum1, ax, bx = b[0]);
        smlal(&accum3, ax = a[9], bx);
-        
+
        smlal(&accum1, ax = a[1], bx = bm[7]);
        smlal(&accum3, ax = a[2], bx);
        smlal(&accum1, ax, bx = bm[6]);
@@ -164,20 +153,20 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum1, ax, bx = bm[2]);
        smlal(&accum3, ax = a[7], bx);
        smlal(&accum1, ax, bx = bm[1]);
-        
+
        /* 1 terms */
        smlal(&accum2, ax = a[0], bx);
        smlal(&accum0, ax, bx = bm[0]);
        smlal(&accum2, ax = a[1], bx);
-        
+
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[0] = ((uint32_t)(accum0)) & mask;
        c[1] = ((uint32_t)(accum2)) & mask;
        c[8] = ((uint32_t)(accum1)) & mask;
        c[9] = ((uint32_t)(accum3)) & mask;
-        
+
        accumC0 = accum2 >> 28;
        accumC1 = accum3 >> 28;
    }
@@ -192,10 +181,10 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum1, ax, bx = b[12]);
        smlal(&accum3, ax = aa[7], bx);
        smlal(&accum1, ax, bx = b[11]);
-        
+
        accum0 = accum1;
        accum2 = accum3;
-        
+
        /* t^2 terms */
        smlal(&accum2, ax = aa[0], bx);
        smlal(&accum0, ax, bx = b[10]);
@@ -204,7 +193,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum2, ax = aa[2], bx);
        smlal(&accum0, ax, bx = b[8]);
        smlal(&accum2, ax = aa[3], bx);
-        
+
        smlal(&accum0, ax = a[11], bx = b[7]);
        smlal(&accum2, ax = a[12], bx);
        smlal(&accum0, ax, bx = b[6]);
@@ -214,7 +203,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum0, ax, bx = b[4]);
        smlal(&accum2, ax = a[15], bx);
        smlal(&accum0, ax, bx = b[3]);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 += accum2;
@@ -225,7 +214,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum3, ax = a[10], bx);
        smlal(&accum1, ax, bx = b[0]);
        smlal(&accum3, ax = a[11], bx);
-        
+
        smlal(&accum1, ax = a[3], bx = bm[7]);
        smlal(&accum3, ax = a[4], bx);
        smlal(&accum1, ax, bx = bm[6]);
@@ -235,7 +224,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum1, ax, bx = bm[4]);
        smlal(&accum3, ax = a[7], bx);
        smlal(&accum1, ax, bx = bm[3]);
-        
+
        /* 1 terms */
        smlal(&accum2, ax = a[0], bx);
        smlal(&accum0, ax, bx = bm[2]);
@@ -244,34 +233,34 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum2, ax = a[2], bx);
        smlal(&accum0, ax, bx = bm[0]);
        smlal(&accum2, ax = a[3], bx);
-        
+
        accum0 += accumC0;
        accum1 += accumC1;
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[2] = ((uint32_t)(accum0)) & mask;
        c[3] = ((uint32_t)(accum2)) & mask;
        c[10] = ((uint32_t)(accum1)) & mask;
        c[11] = ((uint32_t)(accum3)) & mask;
-        
+
        accumC0 = accum2 >> 28;
        accumC1 = accum3 >> 28;
    }
    {
-        
+
        /* t^3 terms */
        smull(&accum1, ax = aa[5], bx = b[15]);
        smull(&accum3, ax = aa[6], bx);
        smlal(&accum1, ax, bx = b[14]);
        smlal(&accum3, ax = aa[7], bx);
        smlal(&accum1, ax, bx = b[13]);
-        
+
        accum0 = accum1;
        accum2 = accum3;
-        
+
        /* t^2 terms */
-        
+
        smlal(&accum2, ax = aa[0], bx);
        smlal(&accum0, ax, bx = b[12]);
        smlal(&accum2, ax = aa[1], bx);
@@ -283,18 +272,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum2, ax = aa[4], bx);
        smlal(&accum0, ax, bx = b[8]);
        smlal(&accum2, ax = aa[5], bx);
-        
-        
+
        smlal(&accum0, ax = a[13], bx = b[7]);
        smlal(&accum2, ax = a[14], bx);
        smlal(&accum0, ax, bx = b[6]);
        smlal(&accum2, ax = a[15], bx);
        smlal(&accum0, ax, bx = b[5]);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 += accum2;
-        
+
        smlal(&accum3, ax = a[8], bx);
        smlal(&accum1, ax, bx = b[4]);
        smlal(&accum3, ax = a[9], bx);
@@ -306,16 +294,15 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum3, ax = a[12], bx);
        smlal(&accum1, ax, bx = b[0]);
        smlal(&accum3, ax = a[13], bx);
-        
-        
+
        smlal(&accum1, ax = a[5], bx = bm[7]);
        smlal(&accum3, ax = a[6], bx);
        smlal(&accum1, ax, bx = bm[6]);
        smlal(&accum3, ax = a[7], bx);
        smlal(&accum1, ax, bx = bm[5]);
-        
+
        /* 1 terms */
-        
+
        smlal(&accum2, ax = a[0], bx);
        smlal(&accum0, ax, bx = bm[4]);
        smlal(&accum2, ax = a[1], bx);
@@ -327,28 +314,28 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum2, ax = a[4], bx);
        smlal(&accum0, ax, bx = bm[0]);
        smlal(&accum2, ax = a[5], bx);
-        
+
        accum0 += accumC0;
        accum1 += accumC1;
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[4] = ((uint32_t)(accum0)) & mask;
        c[5] = ((uint32_t)(accum2)) & mask;
        c[12] = ((uint32_t)(accum1)) & mask;
        c[13] = ((uint32_t)(accum3)) & mask;
-        
+
        accumC0 = accum2 >> 28;
        accumC1 = accum3 >> 28;
    }
    {
-        
+
        /* t^3 terms */
        smull(&accum1, ax = aa[7], bx = b[15]);
        accum0 = accum1;
-        
+
        /* t^2 terms */
-        
+
        smull(&accum2, ax = aa[0], bx);
        smlal(&accum0, ax, bx = b[14]);
        smlal(&accum2, ax = aa[1], bx);
@@ -364,14 +351,13 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum2, ax = aa[6], bx);
        smlal(&accum0, ax, bx = b[8]);
        smlal(&accum2, ax = aa[7], bx);
-        
-        
+
        smlal(&accum0, ax = a[15], bx = b[7]);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 = accum2;
-        
+
        smlal(&accum3, ax = a[8], bx);
        smlal(&accum1, ax, bx = b[6]);
        smlal(&accum3, ax = a[9], bx);
@@ -387,12 +373,11 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum3, ax = a[14], bx);
        smlal(&accum1, ax, bx = b[0]);
        smlal(&accum3, ax = a[15], bx);
-        
-        
+
        smlal(&accum1, ax = a[7], bx = bm[7]);
-        
+
        /* 1 terms */
-        
+
        smlal(&accum2, ax = a[0], bx);
        smlal(&accum0, ax, bx = bm[6]);
        smlal(&accum2, ax = a[1], bx);
@@ -408,17 +393,17 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        smlal(&accum2, ax = a[6], bx);
        smlal(&accum0, ax, bx = bm[0]);
        smlal(&accum2, ax = a[7], bx);
-        
+
        accum0 += accumC0;
        accum1 += accumC1;
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[6] = ((uint32_t)(accum0)) & mask;
        c[7] = ((uint32_t)(accum2)) & mask;
        c[14] = ((uint32_t)(accum1)) & mask;
        c[15] = ((uint32_t)(accum3)) & mask;
-        
+
        accum0 = accum2 >> 28;
        accum1 = accum3 >> 28;
    }
@@ -428,28 +413,29 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    accum1 += c[0];
    c[8] = ((uint32_t)(accum0)) & mask;
    c[0] = ((uint32_t)(accum1)) & mask;
-    
+
    accum0 >>= 28;
    accum1 >>= 28;
    c[9] += ((uint32_t)(accum0));
    c[1] += ((uint32_t)(accum1));
 }

-void gf_sqr (gf_s *__restrict__ cs, const gf as) {
+void gf_sqr(gf_s * __restrict__ cs, const gf as)
+{
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

    uint64_t accum0 = 0, accum1 = 0, accum2, accum3, accumC0, accumC1, tmp;
-    uint32_t mask = (1<<28) - 1;  
+    uint32_t mask = (1 << 28) - 1;

    uint32_t bm[8];
-    
+
    int i;
-    for (i=0; i<8; i++) {
-        bm[i] = a[i] - a[i+8];
+    for (i = 0; i < 8; i++) {
+        bm[i] = a[i] - a[i + 8];
    }

-    uint32_t ax,bx;
+    uint32_t ax, bx;
    {
        /* t^3 terms */
        smull2(&accum1, ax = a[9], bx = a[15]);
@@ -459,14 +445,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum1, ax, bx = a[13]);
        smlal2(&accum3, ax = a[12], bx);
        smlal(&accum1, ax, ax);
-        
+
        accum0 = accum1;
        accum2 = accum3;
-        
+
        /* t^2 terms */
        smlal2(&accum2, ax = a[8], a[9]);
        smlal(&accum0, ax, ax);
-        
+
        smlal2(&accum0, ax = a[1], bx = a[7]);
        smlal2(&accum2, ax = a[2], bx);
        smlal2(&accum0, ax, bx = a[6]);
@@ -474,18 +460,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum0, ax, bx = a[5]);
        smlal2(&accum2, ax = a[4], bx);
        smlal(&accum0, ax, ax);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 += accum2;
        smlal2(&accum3, ax = a[0], bx = a[1]);
        smlal(&accum1, ax, ax);
-        
+
        accum1 = -accum1;
        accum3 = -accum3;
        accum2 = -accum2;
        accum0 = -accum0;
-        
+
        smlal2(&accum1, ax = bm[1], bx = bm[7]);
        smlal2(&accum3, ax = bm[2], bx);
        smlal2(&accum1, ax, bx = bm[6]);
@@ -493,22 +479,26 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum1, ax, bx = bm[5]);
        smlal2(&accum3, ax = bm[4], bx);
        smlal(&accum1, ax, ax);
-        
+
        /* 1 terms */
        smlal2(&accum2, ax = bm[0], bx = bm[1]);
        smlal(&accum0, ax, ax);
-        
-        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
-        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
-        
+
+        tmp = -accum3;
+        accum3 = tmp - accum2;
+        accum2 = tmp;
+        tmp = -accum1;
+        accum1 = tmp - accum0;
+        accum0 = tmp;
+
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[0] = ((uint32_t)(accum0)) & mask;
        c[1] = ((uint32_t)(accum2)) & mask;
        c[8] = ((uint32_t)(accum1)) & mask;
        c[9] = ((uint32_t)(accum3)) & mask;
-        
+
        accumC0 = accum2 >> 28;
        accumC1 = accum3 >> 28;
    }
@@ -519,22 +509,22 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum1, ax, bx = a[14]);
        smlal2(&accum3, ax = a[13], bx);
        smlal(&accum1, ax, ax);
-        
+
        accum0 = accum1;
        accum2 = accum3;
-        
+
        /* t^2 terms */
        smlal2(&accum2, ax = a[8], bx = a[11]);
        smlal2(&accum0, ax, bx = a[10]);
        smlal2(&accum2, ax = a[9], bx);
        smlal(&accum0, ax, ax);
-        
+
        smlal2(&accum0, ax = a[3], bx = a[7]);
        smlal2(&accum2, ax = a[4], bx);
        smlal2(&accum0, ax, bx = a[6]);
        smlal2(&accum2, ax = a[5], bx);
        smlal(&accum0, ax, ax);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 += accum2;
@@ -542,119 +532,124 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum1, ax, bx = a[2]);
        smlal2(&accum3, ax = a[1], bx);
        smlal(&accum1, ax, ax);
-        
+
        accum1 = -accum1;
        accum3 = -accum3;
        accum2 = -accum2;
        accum0 = -accum0;
-        
+
        smlal2(&accum1, ax = bm[3], bx = bm[7]);
        smlal2(&accum3, ax = bm[4], bx);
        smlal2(&accum1, ax, bx = bm[6]);
        smlal2(&accum3, ax = bm[5], bx);
        smlal(&accum1, ax, ax);
-        
+
        /* 1 terms */
        smlal2(&accum2, ax = bm[0], bx = bm[3]);
        smlal2(&accum0, ax, bx = bm[2]);
        smlal2(&accum2, ax = bm[1], bx);
        smlal(&accum0, ax, ax);
-        
-        
-        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
-        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
-        
+
+        tmp = -accum3;
+        accum3 = tmp - accum2;
+        accum2 = tmp;
+        tmp = -accum1;
+        accum1 = tmp - accum0;
+        accum0 = tmp;
+
        accum0 += accumC0;
        accum1 += accumC1;
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[2] = ((uint32_t)(accum0)) & mask;
        c[3] = ((uint32_t)(accum2)) & mask;
        c[10] = ((uint32_t)(accum1)) & mask;
        c[11] = ((uint32_t)(accum3)) & mask;
-        
+
        accumC0 = accum2 >> 28;
        accumC1 = accum3 >> 28;
    }
    {
-        
+
        /* t^3 terms */
        smull2(&accum1, ax = a[13], bx = a[15]);
        smull2(&accum3, ax = a[14], bx);
        smlal(&accum1, ax, ax);
-        
+
        accum0 = accum1;
        accum2 = accum3;
-        
+
        /* t^2 terms */
-        
+
        smlal2(&accum2, ax = a[8], bx = a[13]);
        smlal2(&accum0, ax, bx = a[12]);
        smlal2(&accum2, ax = a[9], bx);
        smlal2(&accum0, ax, bx = a[11]);
        smlal2(&accum2, ax = a[10], bx);
        smlal(&accum0, ax, ax);
-        
-        
+
        smlal2(&accum0, ax = a[5], bx = a[7]);
        smlal2(&accum2, ax = a[6], bx);
        smlal(&accum0, ax, ax);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 += accum2;
-        
+
        smlal2(&accum3, ax = a[0], bx = a[5]);
        smlal2(&accum1, ax, bx = a[4]);
        smlal2(&accum3, ax = a[1], bx);
        smlal2(&accum1, ax, bx = a[3]);
        smlal2(&accum3, ax = a[2], bx);
        smlal(&accum1, ax, ax);
-        
+
        accum1 = -accum1;
        accum3 = -accum3;
        accum2 = -accum2;
        accum0 = -accum0;
-        
+
        smlal2(&accum1, ax = bm[5], bx = bm[7]);
        smlal2(&accum3, ax = bm[6], bx);
        smlal(&accum1, ax, ax);
-        
+
        /* 1 terms */
-        
+
        smlal2(&accum2, ax = bm[0], bx = bm[5]);
        smlal2(&accum0, ax, bx = bm[4]);
        smlal2(&accum2, ax = bm[1], bx);
        smlal2(&accum0, ax, bx = bm[3]);
        smlal2(&accum2, ax = bm[2], bx);
        smlal(&accum0, ax, ax);
-        
-        
-        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
-        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
-        
+
+        tmp = -accum3;
+        accum3 = tmp - accum2;
+        accum2 = tmp;
+        tmp = -accum1;
+        accum1 = tmp - accum0;
+        accum0 = tmp;
+
        accum0 += accumC0;
        accum1 += accumC1;
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[4] = ((uint32_t)(accum0)) & mask;
        c[5] = ((uint32_t)(accum2)) & mask;
        c[12] = ((uint32_t)(accum1)) & mask;
        c[13] = ((uint32_t)(accum3)) & mask;
-        
+
        accumC0 = accum2 >> 28;
        accumC1 = accum3 >> 28;
    }
    {
-        
+
        /* t^3 terms */
        smull(&accum1, ax = a[15], bx = a[15]);
        accum0 = accum1;
-        
+
        /* t^2 terms */
-        
+
        smull2(&accum2, ax = a[8], bx);
        smlal2(&accum0, ax, bx = a[14]);
        smlal2(&accum2, ax = a[9], bx);
@@ -663,14 +658,13 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum0, ax, bx = a[12]);
        smlal2(&accum2, ax = a[11], bx);
        smlal(&accum0, ax, ax);
-        
-        
+
        smlal(&accum0, ax = a[7], bx = a[7]);
-        
+
        /* t terms */
        accum1 += accum0;
        accum3 = accum2;
-        
+
        smlal2(&accum3, ax = a[0], bx);
        smlal2(&accum1, ax, bx = a[6]);
        smlal2(&accum3, ax = a[1], bx);
@@ -679,17 +673,17 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum1, ax, bx = a[4]);
        smlal2(&accum3, ax = a[3], bx);
        smlal(&accum1, ax, ax);
-        
+
        accum1 = -accum1;
        accum3 = -accum3;
        accum2 = -accum2;
        accum0 = -accum0;
-        
+
        bx = bm[7];
        smlal(&accum1, bx, bx);
-        
+
        /* 1 terms */
-        
+
        smlal2(&accum2, ax = bm[0], bx);
        smlal2(&accum0, ax, bx = bm[6]);
        smlal2(&accum2, ax = bm[1], bx);
@@ -698,21 +692,24 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
        smlal2(&accum0, ax, bx = bm[4]);
        smlal2(&accum2, ax = bm[3], bx);
        smlal(&accum0, ax, ax);
-        
-        tmp = -accum3; accum3 = tmp-accum2; accum2 = tmp;
-        tmp = -accum1; accum1 = tmp-accum0; accum0 = tmp;
-        
-        
+
+        tmp = -accum3;
+        accum3 = tmp - accum2;
+        accum2 = tmp;
+        tmp = -accum1;
+        accum1 = tmp - accum0;
+        accum0 = tmp;
+
        accum0 += accumC0;
        accum1 += accumC1;
        accum2 += accum0 >> 28;
        accum3 += accum1 >> 28;
-        
+
        c[6] = ((uint32_t)(accum0)) & mask;
        c[7] = ((uint32_t)(accum2)) & mask;
        c[14] = ((uint32_t)(accum1)) & mask;
        c[15] = ((uint32_t)(accum3)) & mask;
-        
+
        accum0 = accum2 >> 28;
        accum1 = accum3 >> 28;
    }
@@ -722,21 +719,18 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    accum1 += c[0];
    c[8] = ((uint32_t)(accum0)) & mask;
    c[0] = ((uint32_t)(accum1)) & mask;
-    
+
    accum0 >>= 28;
    accum1 >>= 28;
    c[9] += ((uint32_t)(accum0));
    c[1] += ((uint32_t)(accum1));
 }

-void gf_mulw_unsigned (
-    gf_s *__restrict__ cs,
-    const gf as,
-    uint32_t b
-) {
-    uint32_t mask = (1ull<<28)-1;  
+void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
+{
+    uint32_t mask = (1ull << 28) - 1;
    assert(b <= mask);
-    
+
    const uint32_t *a = as->limb;
    uint32_t *c = cs->limb;

@ -745,75 +739,99 @@ void gf_mulw_unsigned (
    int i;

    uint32_t c0, c8, n0, n8;
-    c0 = a[0]; c8 = a[8];
+    c0 = a[0];
+    c8 = a[8];
    accum0 = widemul(b, c0);
    accum8 = widemul(b, c8);

-    c[0] = accum0 & mask; accum0 >>= 28;
-    c[8] = accum8 & mask; accum8 >>= 28;
-    
-    i=1;
+    c[0] = accum0 & mask;
+    accum0 >>= 28;
+    c[8] = accum8 & mask;
+    accum8 >>= 28;
+
+    i = 1;
    {
-        n0 = a[i]; n8 = a[i+8];
+        n0 = a[i];
+        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);
-        
-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
+
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
        i++;
    }
    {
-        c0 = a[i]; c8 = a[i+8];
+        c0 = a[i];
+        c8 = a[i + 8];
        smlal(&accum0, b, c0);
        smlal(&accum8, b, c8);

-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
        i++;
    }
    {
-        n0 = a[i]; n8 = a[i+8];
+        n0 = a[i];
+        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);

-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
        i++;
    }
    {
-        c0 = a[i]; c8 = a[i+8];
+        c0 = a[i];
+        c8 = a[i + 8];
        smlal(&accum0, b, c0);
        smlal(&accum8, b, c8);

-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
        i++;
    }
    {
-        n0 = a[i]; n8 = a[i+8];
+        n0 = a[i];
+        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);

-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
        i++;
    }
    {
-        c0 = a[i]; c8 = a[i+8];
+        c0 = a[i];
+        c8 = a[i + 8];
        smlal(&accum0, b, c0);
        smlal(&accum8, b, c8);
-        
-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
+
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
        i++;
    }
    {
-        n0 = a[i]; n8 = a[i+8];
+        n0 = a[i];
+        n8 = a[i + 8];
        smlal(&accum0, b, n0);
        smlal(&accum8, b, n8);

-        c[i] = accum0 & mask; accum0 >>= 28;
-        c[i+8] = accum8 & mask; accum8 >>= 28;
+        c[i] = accum0 & mask;
+        accum0 >>= 28;
+        c[i + 8] = accum8 & mask;
+        accum8 >>= 28;
        i++;
    }

--- a/crypto/ec/curve448/arch_arm_32/f_impl.h
+++ b/crypto/ec/curve448/arch_arm_32/f_impl.h
@ -14,48 +14,52 @@
 #define LIMB(x) (x##ull)&((1ull<<28)-1), (x##ull)>>28
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMB(a),LIMB(b),LIMB(c),LIMB(d),LIMB(e),LIMB(f),LIMB(g),LIMB(h)}}
-    
+
 #define LIMB_PLACE_VALUE(i) 28

-void gf_add_RAW (gf out, const gf a, const gf b) {
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
-        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
+    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
+        ((uint32xn_t *) out)[i] =
+            ((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
    }
    /*
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
-        out->limb[i] = a->limb[i] + b->limb[i];
-    }
-    */
+     * for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++)
+     *     out->limb[i] = a->limb[i] + b->limb[i];
+     */
 }

-void gf_sub_RAW (gf out, const gf a, const gf b) {
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
-        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
+    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
+        ((uint32xn_t *) out)[i] =
+            ((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
    }
    /*
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
-        out->limb[i] = a->limb[i] - b->limb[i];
-    }
-    */
+     * for (unsigned int i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++)
+     *     out->limb[i] = a->limb[i] - b->limb[i];
+     */
 }

-void gf_bias (gf a, int amt) {
-    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
-    uint32x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
-    uint32x4_t *aa = (uint32x4_t*) a;
+void gf_bias(gf a, int amt)
+{
+    uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
+    uint32x4_t lo = { co1, co1, co1, co1 }, hi = {
+    co2, co1, co1, co1};
+    uint32x4_t *aa = (uint32x4_t *) a;
    aa[0] += lo;
    aa[1] += lo;
    aa[2] += hi;
    aa[3] += lo;
 }

-void gf_weak_reduce (gf a) {
-    uint64_t mask = (1ull<<28) - 1;
+void gf_weak_reduce(gf a)
+{
+    uint64_t mask = (1ull << 28) - 1;
    uint64_t tmp = a->limb[15] >> 28;
    a->limb[8] += tmp;
-    for (unsigned int i=15; i>0; i--) {
-        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>28);
+    for (unsigned int i = 15; i > 0; i--) {
+        a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 28);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }
-
--- a/crypto/ec/curve448/arch_neon/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_neon/arch_intrinsics.h
@ -11,22 +11,26 @@
 */

 #ifndef __ARCH_NEON_ARCH_INTRINSICS_H__
-#define __ARCH_NEON_ARCH_INTRINSICS_H__
+# define __ARCH_NEON_ARCH_INTRINSICS_H__

-#define ARCH_WORD_BITS 32
+# define ARCH_WORD_BITS 32

-static __inline__ __attribute((always_inline,unused))
-uint32_t word_is_zero(uint32_t a) {
+static __inline__ __attribute((always_inline, unused))
+uint32_t word_is_zero(uint32_t a)
+{
    uint32_t ret;
-    __asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0" : "=r"(ret) : "r"(a) : "cc");
+ __asm__("subs %0, %1, #1;\n\tsbc %0, %0, %0": "=r"(ret): "r"(a):"cc");
    return ret;
 }

-static __inline__ __attribute((always_inline,unused))
-uint64_t widemul(uint32_t a, uint32_t b) {
-    /* Could be UMULL, but it's hard to express to CC that the registers must be different */
-    return ((uint64_t)a) * b; 
+static __inline__ __attribute((always_inline, unused))
+uint64_t widemul(uint32_t a, uint32_t b)
+{
+    /*
+     * Could be UMULL, but it's hard to express to CC that the registers must
+     * be different
+     */
+    return ((uint64_t)a) * b;
 }

-#endif /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
-
+#endif                          /* __ARCH_NEON_ARCH_INTRINSICS_H__ */
--- a/crypto/ec/curve448/arch_neon/f_impl.c
+++ b/crypto/ec/curve448/arch_neon/f_impl.c
--- a/crypto/ec/curve448/arch_neon/f_impl.h
+++ b/crypto/ec/curve448/arch_neon/f_impl.h
@ -15,50 +15,55 @@
 #define USE_NEON_PERM 1
 #define LIMBHI(x) ((x##ull)>>28)
 #define LIMBLO(x) ((x##ull)&((1ull<<28)-1))
-#  define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
+#define FIELD_LITERAL(a,b,c,d,e,f,g,h) \
    {{LIMBLO(a),LIMBLO(e), LIMBHI(a),LIMBHI(e), \
      LIMBLO(b),LIMBLO(f), LIMBHI(b),LIMBHI(f), \
      LIMBLO(c),LIMBLO(g), LIMBHI(c),LIMBHI(g), \
      LIMBLO(d),LIMBLO(h), LIMBHI(d),LIMBHI(h)}}
-    
+
 #define LIMB_PLACE_VALUE(i) 28

-void gf_add_RAW (gf out, const gf a, const gf b) {
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
-        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] + ((const uint32xn_t*)b)[i];
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
+    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
+        ((uint32xn_t *) out)[i] =
+            ((const uint32xn_t *)a)[i] + ((const uint32xn_t *)b)[i];
    }
 }

-void gf_sub_RAW (gf out, const gf a, const gf b) {
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(uint32xn_t); i++) {
-        ((uint32xn_t*)out)[i] = ((const uint32xn_t*)a)[i] - ((const uint32xn_t*)b)[i];
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
+    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint32xn_t); i++) {
+        ((uint32xn_t *) out)[i] =
+            ((const uint32xn_t *)a)[i] - ((const uint32xn_t *)b)[i];
    }
    /*
-    unsigned int i;
-    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
-        out->limb[i] = a->limb[i] - b->limb[i];
-    }
-    */
+     * unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++)
+     *     out->limb[i] = a->limb[i] - b->limb[i];
+     */
 }

-void gf_bias (gf a, int amt) {
-    uint32_t co1 = ((1ull<<28)-1)*amt, co2 = co1-amt;
-    uint32x4_t lo = {co1,co2,co1,co1}, hi = {co1,co1,co1,co1};
-    uint32x4_t *aa = (uint32x4_t*) a;
+void gf_bias(gf a, int amt)
+{
+    uint32_t co1 = ((1ull << 28) - 1) * amt, co2 = co1 - amt;
+    uint32x4_t lo = { co1, co2, co1, co1 }, hi = {
+    co1, co1, co1, co1};
+    uint32x4_t *aa = (uint32x4_t *) a;
    aa[0] += lo;
    aa[1] += hi;
    aa[2] += hi;
    aa[3] += hi;
 }

-void gf_weak_reduce (gf a) {
+void gf_weak_reduce(gf a)
+{

-    uint32x2_t *aa = (uint32x2_t*) a, vmask = {(1ull<<28)-1, (1ull<<28)-1}, vm2 = {0,-1},
-       tmp = vshr_n_u32(aa[7],28);
-       
-    for (unsigned int i=7; i>=1; i--) {
-        aa[i] = vsra_n_u32(aa[i] & vmask, aa[i-1], 28);
+    uint32x2_t *aa = (uint32x2_t *) a, vmask = {
+    (1ull << 28) - 1, (1ull << 28) - 1}, vm2 = {
+    0, -1}, tmp = vshr_n_u32(aa[7], 28);
+
+    for (unsigned int i = 7; i >= 1; i--) {
+        aa[i] = vsra_n_u32(aa[i] & vmask, aa[i - 1], 28);
    }
-    aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp&vm2);
+    aa[0] = (aa[0] & vmask) + vrev64_u32(tmp) + (tmp & vm2);
 }
-
--- a/crypto/ec/curve448/arch_ref64/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_ref64/arch_intrinsics.h
@ -11,20 +11,21 @@
 */

 #ifndef __ARCH_REF64_ARCH_INTRINSICS_H__
-#define __ARCH_REF64_ARCH_INTRINSICS_H__
+# define __ARCH_REF64_ARCH_INTRINSICS_H__

-#define ARCH_WORD_BITS 64
+# define ARCH_WORD_BITS 64

-static __inline__ __attribute((always_inline,unused))
-uint64_t word_is_zero(uint64_t a) {
+static __inline__ __attribute((always_inline, unused))
+uint64_t word_is_zero(uint64_t a)
+{
    /* let's hope the compiler isn't clever enough to optimize this. */
-    return (((__uint128_t)a)-1)>>64;
+    return (((__uint128_t) a) - 1) >> 64;
 }

-static __inline__ __attribute((always_inline,unused))
-__uint128_t widemul(uint64_t a, uint64_t b) {
-    return ((__uint128_t)a) * b; 
+static __inline__ __attribute((always_inline, unused))
+__uint128_t widemul(uint64_t a, uint64_t b)
+{
+    return ((__uint128_t) a) * b;
 }

-#endif /* ARCH_REF64_ARCH_INTRINSICS_H__ */
-
+#endif                          /* __ARCH_REF64_ARCH_INTRINSICS_H__ */
--- a/crypto/ec/curve448/arch_ref64/f_impl.c
+++ b/crypto/ec/curve448/arch_ref64/f_impl.c
@ -11,68 +11,69 @@
 */
 #include "f_field.h"

-void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
+void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+{
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
-    uint64_t mask = (1ull<<56) - 1;  
+    uint64_t mask = (1ull << 56) - 1;

    uint64_t aa[4], bb[4], bbb[4];

    unsigned int i;
-    for (i=0; i<4; i++) {
-        aa[i]  = a[i] + a[i+4];
-        bb[i]  = b[i] + b[i+4];
-        bbb[i] = bb[i] + b[i+4];
+    for (i = 0; i < 4; i++) {
+        aa[i] = a[i] + a[i + 4];
+        bb[i] = b[i] + b[i + 4];
+        bbb[i] = bb[i] + b[i + 4];
    }

    int I_HATE_UNROLLED_LOOPS = 0;

    if (I_HATE_UNROLLED_LOOPS) {
-        /* The compiler probably won't unroll this,
-         * so it's like 80% slower.
+        /*
+         * The compiler probably won't unroll this, so it's like 80% slower.
         */
-        for (i=0; i<4; i++) {
+        for (i = 0; i < 4; i++) {
            accum2 = 0;

            unsigned int j;
-            for (j=0; j<=i; j++) {
-                accum2 += widemul(a[j],   b[i-j]);
-                accum1 += widemul(aa[j], bb[i-j]);
-                accum0 += widemul(a[j+4], b[i-j+4]);
+            for (j = 0; j <= i; j++) {
+                accum2 += widemul(a[j], b[i - j]);
+                accum1 += widemul(aa[j], bb[i - j]);
+                accum0 += widemul(a[j + 4], b[i - j + 4]);
            }
-            for (; j<4; j++) {
-                accum2 += widemul(a[j],   b[i-j+8]);
-                accum1 += widemul(aa[j], bbb[i-j+4]);
-                accum0 += widemul(a[j+4], bb[i-j+4]);
+            for (; j < 4; j++) {
+                accum2 += widemul(a[j], b[i - j + 8]);
+                accum1 += widemul(aa[j], bbb[i - j + 4]);
+                accum0 += widemul(a[j + 4], bb[i - j + 4]);
            }

            accum1 -= accum2;
            accum0 += accum2;

-            c[i]   = ((uint64_t)(accum0)) & mask;
-            c[i+4] = ((uint64_t)(accum1)) & mask;
+            c[i] = ((uint64_t)(accum0)) & mask;
+            c[i + 4] = ((uint64_t)(accum1)) & mask;

            accum0 >>= 56;
            accum1 >>= 56;
        }
    } else {
-        accum2  = widemul(a[0],  b[0]);
+        accum2 = widemul(a[0], b[0]);
        accum1 += widemul(aa[0], bb[0]);
-        accum0 += widemul(a[4],  b[4]);
+        accum0 += widemul(a[4], b[4]);

-        accum2 += widemul(a[1],  b[7]);
+        accum2 += widemul(a[1], b[7]);
        accum1 += widemul(aa[1], bbb[3]);
-        accum0 += widemul(a[5],  bb[3]);
+        accum0 += widemul(a[5], bb[3]);

-        accum2 += widemul(a[2],  b[6]);
+        accum2 += widemul(a[2], b[6]);
        accum1 += widemul(aa[2], bbb[2]);
-        accum0 += widemul(a[6],  bb[2]);
+        accum0 += widemul(a[6], bb[2]);

-        accum2 += widemul(a[3],  b[5]);
+        accum2 += widemul(a[3], b[5]);
        accum1 += widemul(aa[3], bbb[1]);
-        accum0 += widemul(a[7],  bb[1]);
+        accum0 += widemul(a[7], bb[1]);

        accum1 -= accum2;
        accum0 += accum2;
@ -83,21 +84,21 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        accum0 >>= 56;
        accum1 >>= 56;

-        accum2  = widemul(a[0],  b[1]);
+        accum2 = widemul(a[0], b[1]);
        accum1 += widemul(aa[0], bb[1]);
-        accum0 += widemul(a[4],  b[5]);
+        accum0 += widemul(a[4], b[5]);

-        accum2 += widemul(a[1],  b[0]);
+        accum2 += widemul(a[1], b[0]);
        accum1 += widemul(aa[1], bb[0]);
-        accum0 += widemul(a[5],  b[4]);
+        accum0 += widemul(a[5], b[4]);

-        accum2 += widemul(a[2],  b[7]);
+        accum2 += widemul(a[2], b[7]);
        accum1 += widemul(aa[2], bbb[3]);
-        accum0 += widemul(a[6],  bb[3]);
+        accum0 += widemul(a[6], bb[3]);

-        accum2 += widemul(a[3],  b[6]);
+        accum2 += widemul(a[3], b[6]);
        accum1 += widemul(aa[3], bbb[2]);
-        accum0 += widemul(a[7],  bb[2]);
+        accum0 += widemul(a[7], bb[2]);

        accum1 -= accum2;
        accum0 += accum2;
@ -108,21 +109,21 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        accum0 >>= 56;
        accum1 >>= 56;

-        accum2  = widemul(a[0],  b[2]);
+        accum2 = widemul(a[0], b[2]);
        accum1 += widemul(aa[0], bb[2]);
-        accum0 += widemul(a[4],  b[6]);
+        accum0 += widemul(a[4], b[6]);

-        accum2 += widemul(a[1],  b[1]);
+        accum2 += widemul(a[1], b[1]);
        accum1 += widemul(aa[1], bb[1]);
-        accum0 += widemul(a[5],  b[5]);
+        accum0 += widemul(a[5], b[5]);

-        accum2 += widemul(a[2],  b[0]);
+        accum2 += widemul(a[2], b[0]);
        accum1 += widemul(aa[2], bb[0]);
-        accum0 += widemul(a[6],  b[4]);
+        accum0 += widemul(a[6], b[4]);

-        accum2 += widemul(a[3],  b[7]);
+        accum2 += widemul(a[3], b[7]);
        accum1 += widemul(aa[3], bbb[3]);
-        accum0 += widemul(a[7],  bb[3]);
+        accum0 += widemul(a[7], bb[3]);

        accum1 -= accum2;
        accum0 += accum2;
@ -133,21 +134,21 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
        accum0 >>= 56;
        accum1 >>= 56;

-        accum2  = widemul(a[0],  b[3]);
+        accum2 = widemul(a[0], b[3]);
        accum1 += widemul(aa[0], bb[3]);
-        accum0 += widemul(a[4],  b[7]);
+        accum0 += widemul(a[4], b[7]);

-        accum2 += widemul(a[1],  b[2]);
+        accum2 += widemul(a[1], b[2]);
        accum1 += widemul(aa[1], bb[2]);
-        accum0 += widemul(a[5],  b[6]);
+        accum0 += widemul(a[5], b[6]);

-        accum2 += widemul(a[2],  b[1]);
+        accum2 += widemul(a[2], b[1]);
        accum1 += widemul(aa[2], bb[1]);
-        accum0 += widemul(a[6],  b[5]);
+        accum0 += widemul(a[6], b[5]);

-        accum2 += widemul(a[3],  b[0]);
+        accum2 += widemul(a[3], b[0]);
        accum1 += widemul(aa[3], bb[0]);
-        accum0 += widemul(a[7],  b[4]);
+        accum0 += widemul(a[7], b[4]);

        accum1 -= accum2;
        accum0 += accum2;
@ -157,7 +158,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {

        accum0 >>= 56;
        accum1 >>= 56;
-    } /* !I_HATE_UNROLLED_LOOPS */
+    }                           /* !I_HATE_UNROLLED_LOOPS */

    accum0 += accum1;
    accum0 += c[4];
@ -172,21 +173,24 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    c[1] += ((uint64_t)(accum1));
 }

-void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
+{
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum4 = 0;
-    uint64_t mask = (1ull<<56) - 1;  
+    uint64_t mask = (1ull << 56) - 1;

    int i;
-    for (i=0; i<4; i++) {
+    for (i = 0; i < 4; i++) {
        accum0 += widemul(b, a[i]);
-        accum4 += widemul(b, a[i+4]);
-        c[i]   = accum0 & mask; accum0 >>= 56;
-        c[i+4] = accum4 & mask; accum4 >>= 56;
+        accum4 += widemul(b, a[i + 4]);
+        c[i] = accum0 & mask;
+        accum0 >>= 56;
+        c[i + 4] = accum4 & mask;
+        accum4 >>= 56;
    }
-    
+
    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
    c[5] += accum0 >> 56;
@ -196,24 +200,25 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    c[1] += accum4 >> 56;
 }

-void gf_sqr (gf_s *__restrict__ cs, const gf as) {
+void gf_sqr(gf_s * __restrict__ cs, const gf as)
+{
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
-    uint64_t mask = (1ull<<56) - 1;  
+    uint64_t mask = (1ull << 56) - 1;

    uint64_t aa[4];

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
-    for (i=0; i<4; i++) {
-        aa[i] = a[i] + a[i+4];
+    for (i = 0; i < 4; i++) {
+        aa[i] = a[i] + a[i + 4];
    }

-    accum2  = widemul(a[0],a[3]);
-    accum0  = widemul(aa[0],aa[3]);
-    accum1  = widemul(a[4],a[7]);
+    accum2 = widemul(a[0], a[3]);
+    accum0 = widemul(aa[0], aa[3]);
+    accum1 = widemul(a[4], a[7]);

    accum2 += widemul(a[1], a[2]);
    accum0 += widemul(aa[1], aa[2]);
@ -222,21 +227,21 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    accum0 -= accum2;
    accum1 += accum2;

-    c[3] = ((uint64_t)(accum1))<<1 & mask;
-    c[7] = ((uint64_t)(accum0))<<1 & mask;
+    c[3] = ((uint64_t)(accum1)) << 1 & mask;
+    c[7] = ((uint64_t)(accum0)) << 1 & mask;

    accum0 >>= 55;
    accum1 >>= 55;

-    accum0 += widemul(2*aa[1],aa[3]);
-    accum1 += widemul(2*a[5], a[7]);
+    accum0 += widemul(2 * aa[1], aa[3]);
+    accum1 += widemul(2 * a[5], a[7]);
    accum0 += widemul(aa[2], aa[2]);
    accum1 += accum0;

-    accum0 -= widemul(2*a[1], a[3]);
+    accum0 -= widemul(2 * a[1], a[3]);
    accum1 += widemul(a[6], a[6]);
-    
-    accum2 = widemul(a[0],a[0]);
+
+    accum2 = widemul(a[0], a[0]);
    accum1 -= accum2;
    accum0 += accum2;

@ -250,16 +255,16 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    accum0 >>= 56;
    accum1 >>= 56;

-    accum2  = widemul(2*aa[2],aa[3]);
-    accum0 -= widemul(2*a[2], a[3]);
-    accum1 += widemul(2*a[6], a[7]);
+    accum2 = widemul(2 * aa[2], aa[3]);
+    accum0 -= widemul(2 * a[2], a[3]);
+    accum1 += widemul(2 * a[6], a[7]);

    accum1 += accum2;
    accum0 += accum2;

-    accum2  = widemul(2*a[0],a[1]);
-    accum1 += widemul(2*aa[0], aa[1]);
-    accum0 += widemul(2*a[4], a[5]);
+    accum2 = widemul(2 * a[0], a[1]);
+    accum1 += widemul(2 * aa[0], aa[1]);
+    accum0 += widemul(2 * a[4], a[5]);

    accum1 -= accum2;
    accum0 += accum2;
@ -270,16 +275,16 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    accum0 >>= 56;
    accum1 >>= 56;

-    accum2  = widemul(aa[3],aa[3]);
+    accum2 = widemul(aa[3], aa[3]);
    accum0 -= widemul(a[3], a[3]);
    accum1 += widemul(a[7], a[7]);

    accum1 += accum2;
    accum0 += accum2;

-    accum2  = widemul(2*a[0],a[2]);
-    accum1 += widemul(2*aa[0], aa[2]);
-    accum0 += widemul(2*a[4], a[6]);
+    accum2 = widemul(2 * a[0], a[2]);
+    accum1 += widemul(2 * aa[0], aa[2]);
+    accum0 += widemul(2 * a[4], a[6]);

    accum2 += widemul(a[1], a[1]);
    accum1 += widemul(aa[1], aa[1]);
@ -306,4 +311,3 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    c[4] += ((uint64_t)(accum0)) + ((uint64_t)(accum1));
    c[0] += ((uint64_t)(accum1));
 }
-
--- a/crypto/ec/curve448/arch_ref64/f_impl.h
+++ b/crypto/ec/curve448/arch_ref64/f_impl.h
@ -10,37 +10,41 @@
 * Originally written by Mike Hamburg
 */

-#define GF_HEADROOM 9999 /* Everything is reduced anyway */
+#define GF_HEADROOM 9999        /* Everything is reduced anyway */
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
-    
+
 #define LIMB_PLACE_VALUE(i) 56

-void gf_add_RAW (gf out, const gf a, const gf b) {
-    for (unsigned int i=0; i<8; i++) {
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
+    for (unsigned int i = 0; i < 8; i++) {
        out->limb[i] = a->limb[i] + b->limb[i];
    }
    gf_weak_reduce(out);
 }

-void gf_sub_RAW (gf out, const gf a, const gf b) {
-    uint64_t co1 = ((1ull<<56)-1)*2, co2 = co1-2;
-    for (unsigned int i=0; i<8; i++) {
-        out->limb[i] = a->limb[i] - b->limb[i] + ((i==4) ? co2 : co1);
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
+    uint64_t co1 = ((1ull << 56) - 1) * 2, co2 = co1 - 2;
+    for (unsigned int i = 0; i < 8; i++) {
+        out->limb[i] = a->limb[i] - b->limb[i] + ((i == 4) ? co2 : co1);
    }
    gf_weak_reduce(out);
 }

-void gf_bias (gf a, int amt) {
-    (void) a;
-    (void) amt;
+void gf_bias(gf a, int amt)
+{
+    (void)a;
+    (void)amt;
 }

-void gf_weak_reduce (gf a) {
-    uint64_t mask = (1ull<<56) - 1;
+void gf_weak_reduce(gf a)
+{
+    uint64_t mask = (1ull << 56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
    a->limb[4] += tmp;
-    for (unsigned int i=7; i>0; i--) {
-        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
+    for (unsigned int i = 7; i > 0; i--) {
+        a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }
--- a/crypto/ec/curve448/arch_x86_64/arch_intrinsics.h
+++ b/crypto/ec/curve448/arch_x86_64/arch_intrinsics.h
@ -10,303 +10,292 @@
 * Originally written by Mike Hamburg
 */
 #ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
-#define __ARCH_X86_64_ARCH_INTRINSICS_H__
+# define __ARCH_X86_64_ARCH_INTRINSICS_H__

-#define ARCH_WORD_BITS 64
+# define ARCH_WORD_BITS 64

-#include <openssl/e_os2.h>
+# include <openssl/e_os2.h>

 /* FUTURE: autogenerate */
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
-  uint64_t c,d;
-  #ifndef __BMI2__
-      __asm__ volatile
-          ("movq %[a], %%rax;"
-           "mulq %[b];"
-           : [c]"=&a"(c), [d]"=d"(d)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rdx;"
-           "mulx %[b], %[c], %[d];"
-           : [c]"=r"(c), [d]"=r"(d)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rdx");
-  #endif
-  return (((__uint128_t)(d))<<64) | c;
+static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b)
+{
+    uint64_t c, d;
+# ifndef __BMI2__
+    __asm__ volatile
+     ("movq %[a], %%rax;" "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rdx;" "mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx");
+# endif
+    return (((__uint128_t) (d)) << 64) | c;
 }

-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
-  uint64_t c,d;
-  #ifndef __BMI2__
-      __asm__ volatile
-          ("movq %[a], %%rax;"
-           "mulq %[b];"
-           : [c]"=&a"(c), [d]"=d"(d)
-           : [b]"m"(*b), [a]"r"(a)
-           : "cc");
-  #else
-      __asm__ volatile
-          ("mulx %[b], %[c], %[d];"
-           : [c]"=r"(c), [d]"=r"(d)
-           : [b]"m"(*b), [a]"d"(a));
-  #endif
-  return (((__uint128_t)(d))<<64) | c;
+static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b)
+{
+    uint64_t c, d;
+# ifndef __BMI2__
+    __asm__ volatile
+     ("movq %[a], %%rax;" "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
+      :[b] "m"(*b),[a] "r"(a)
+      :"cc");
+# else
+    __asm__ volatile
+     ("mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
+      :[b] "m"(*b),[a] "d"(a));
+# endif
+    return (((__uint128_t) (d)) << 64) | c;
 }

-static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
-  uint64_t c,d;
-  #ifndef __BMI2__
-      __asm__ volatile
-          ("mulq %[b];"
-           : [c]"=a"(c), [d]"=d"(d)
-           : [b]"r"(b), "a"(a)
-           : "cc");
-  #else
-      __asm__ volatile
-          ("mulx %[b], %[c], %[d];"
-           : [c]"=r"(c), [d]"=r"(d)
-           : [b]"r"(b), [a]"d"(a));
-  #endif
-  return (((__uint128_t)(d))<<64) | c;
+static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b)
+{
+    uint64_t c, d;
+# ifndef __BMI2__
+    __asm__ volatile
+     ("mulq %[b];":[c] "=a"(c),[d] "=d"(d)
+      :[b] "r"(b), "a"(a)
+      :"cc");
+# else
+    __asm__ volatile
+     ("mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
+      :[b] "r"(b),[a] "d"(a));
+# endif
+    return (((__uint128_t) (d)) << 64) | c;
 }

-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
-  uint64_t c,d;
-  #ifndef __BMI2__
-      __asm__ volatile
-          ("movq %[a], %%rax; "
-           "addq %%rax, %%rax; "
-           "mulq %[b];"
-           : [c]"=&a"(c), [d]"=d"(d)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rdx;"
-           "leaq (,%%rdx,2), %%rdx;"
-           "mulx %[b], %[c], %[d];"
-           : [c]"=r"(c), [d]"=r"(d)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rdx");
-  #endif
-  return (((__uint128_t)(d))<<64) | c;
+static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b)
+{
+    uint64_t c, d;
+# ifndef __BMI2__
+    __asm__ volatile
+     ("movq %[a], %%rax; "
+      "addq %%rax, %%rax; " "mulq %[b];":[c] "=&a"(c),[d] "=d"(d)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rdx;"
+      "leaq (,%%rdx,2), %%rdx;" "mulx %[b], %[c], %[d];":[c] "=r"(c),[d] "=r"(d)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx");
+# endif
+    return (((__uint128_t) (d)) << 64) | c;
 }

-static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-  uint64_t lo = *acc, hi = *acc>>64;
-  
-  #ifdef __BMI2__
-      uint64_t c,d;
-      __asm__ volatile
-          ("movq %[a], %%rdx; "
-           "mulx %[b], %[c], %[d]; "
-           "addq %[c], %[lo]; "
-           "adcq %[d], %[hi]; "
-           : [c]"=&r"(c), [d]"=&r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rdx", "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rax; "
-           "mulq %[b]; "
-           "addq %%rax, %[lo]; "
-           "adcq %%rdx, %[hi]; "
-           : [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rax", "rdx", "cc");
-  #endif
-  
-  *acc = (((__uint128_t)(hi))<<64) | lo;
+static __inline__ void mac(__uint128_t * acc, const uint64_t *a,
+                           const uint64_t *b)
+{
+    uint64_t lo = *acc, hi = *acc >> 64;
+
+# ifdef __BMI2__
+    uint64_t c, d;
+    __asm__ volatile
+     ("movq %[a], %%rdx; "
+      "mulx %[b], %[c], %[d]; "
+      "addq %[c], %[lo]; "
+      "adcq %[d], %[hi]; ":[c] "=&r"(c),[d] "=&r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx", "cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rax; "
+      "mulq %[b]; "
+      "addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rax", "rdx", "cc");
+# endif
+
+    *acc = (((__uint128_t) (hi)) << 64) | lo;
 }

-static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
-  uint64_t lo = *acc, hi = *acc>>64;
-  uint64_t lo2 = *acc2, hi2 = *acc2>>64;
-  
-  #ifdef __BMI2__
-      uint64_t c,d;
-      __asm__ volatile
-          ("movq %[a], %%rdx; "
-           "mulx %[b], %[c], %[d]; "
-           "addq %[c], %[lo]; "
-           "adcq %[d], %[hi]; "
-           "addq %[c], %[lo2]; "
-           "adcq %[d], %[hi2]; "
-           : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rdx", "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rax; "
-           "mulq %[b]; "
-           "addq %%rax, %[lo]; "
-           "adcq %%rdx, %[hi]; "
-           "addq %%rax, %[lo2]; "
-           "adcq %%rdx, %[hi2]; "
-           : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rax", "rdx", "cc");
-  #endif
-  
-  *acc = (((__uint128_t)(hi))<<64) | lo;
-  *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
+static __inline__ void macac(__uint128_t * acc, __uint128_t * acc2,
+                             const uint64_t *a, const uint64_t *b)
+{
+    uint64_t lo = *acc, hi = *acc >> 64;
+    uint64_t lo2 = *acc2, hi2 = *acc2 >> 64;
+
+# ifdef __BMI2__
+    uint64_t c, d;
+    __asm__ volatile
+     ("movq %[a], %%rdx; "
+      "mulx %[b], %[c], %[d]; "
+      "addq %[c], %[lo]; "
+      "adcq %[d], %[hi]; "
+      "addq %[c], %[lo2]; "
+      "adcq %[d], %[hi2]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi),
+      [lo2] "+r"(lo2),[hi2] "+r"(hi2)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx", "cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rax; "
+      "mulq %[b]; "
+      "addq %%rax, %[lo]; "
+      "adcq %%rdx, %[hi]; "
+      "addq %%rax, %[lo2]; "
+      "adcq %%rdx, %[hi2]; ":[lo] "+r"(lo),[hi] "+r"(hi),[lo2] "+r"(lo2),
+      [hi2] "+r"(hi2)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rax", "rdx", "cc");
+# endif
+
+    *acc = (((__uint128_t) (hi)) << 64) | lo;
+    *acc2 = (((__uint128_t) (hi2)) << 64) | lo2;
 }

-static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
-  uint64_t lo = *acc, hi = *acc>>64;
-  
-  #ifdef __BMI2__
-      uint64_t c,d;
-      __asm__ volatile
-          ("mulx %[b], %[c], %[d]; "
-           "addq %[c], %[lo]; "
-           "adcq %[d], %[hi]; "
-           : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"d"(a)
-           : "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rax; "
-           "mulq %[b]; "
-           "addq %%rax, %[lo]; "
-           "adcq %%rdx, %[hi]; "
-           : [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"r"(a)
-           : "rax", "rdx", "cc");
-  #endif
-  
-  *acc = (((__uint128_t)(hi))<<64) | lo;
+static __inline__ void mac_rm(__uint128_t * acc, uint64_t a, const uint64_t *b)
+{
+    uint64_t lo = *acc, hi = *acc >> 64;
+
+# ifdef __BMI2__
+    uint64_t c, d;
+    __asm__ volatile
+     ("mulx %[b], %[c], %[d]; "
+      "addq %[c], %[lo]; "
+      "adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "d"(a)
+      :"cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rax; "
+      "mulq %[b]; "
+      "addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "r"(a)
+      :"rax", "rdx", "cc");
+# endif
+
+    *acc = (((__uint128_t) (hi)) << 64) | lo;
 }

-static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) {
-  uint64_t lo = *acc, hi = *acc>>64;
-  
-  #ifdef __BMI2__
-      uint64_t c,d;
-      __asm__ volatile
-          ("mulx %[b], %[c], %[d]; "
-           "addq %[c], %[lo]; "
-           "adcq %[d], %[hi]; "
-           : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"r"(b), [a]"d"(a)
-           : "cc");
-  #else
-      __asm__ volatile
-          ("mulq %[b]; "
-           "addq %%rax, %[lo]; "
-           "adcq %%rdx, %[hi]; "
-           : [lo]"+r"(lo), [hi]"+r"(hi), "+a"(a)
-           : [b]"r"(b)
-           : "rdx", "cc");
-  #endif
-  
-  *acc = (((__uint128_t)(hi))<<64) | lo;
+static __inline__ void mac_rr(__uint128_t * acc, uint64_t a, const uint64_t b)
+{
+    uint64_t lo = *acc, hi = *acc >> 64;
+
+# ifdef __BMI2__
+    uint64_t c, d;
+    __asm__ volatile
+     ("mulx %[b], %[c], %[d]; "
+      "addq %[c], %[lo]; "
+      "adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "r"(b),[a] "d"(a)
+      :"cc");
+# else
+    __asm__ volatile
+     ("mulq %[b]; "
+      "addq %%rax, %[lo]; "
+      "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi), "+a"(a)
+      :[b] "r"(b)
+      :"rdx", "cc");
+# endif
+
+    *acc = (((__uint128_t) (hi)) << 64) | lo;
 }

-static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-  uint64_t lo = *acc, hi = *acc>>64;
-  
-  #ifdef __BMI2__
-      uint64_t c,d;
-      __asm__ volatile
-          ("movq %[a], %%rdx; "
-           "addq %%rdx, %%rdx; "
-           "mulx %[b], %[c], %[d]; "
-           "addq %[c], %[lo]; "
-           "adcq %[d], %[hi]; "
-           : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rdx", "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rax; "
-           "addq %%rax, %%rax; "
-           "mulq %[b]; "
-           "addq %%rax, %[lo]; "
-           "adcq %%rdx, %[hi]; "
-           : [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rax", "rdx", "cc");
-  #endif
-  
-  *acc = (((__uint128_t)(hi))<<64) | lo;
+static __inline__ void mac2(__uint128_t * acc, const uint64_t *a,
+                            const uint64_t *b)
+{
+    uint64_t lo = *acc, hi = *acc >> 64;
+
+# ifdef __BMI2__
+    uint64_t c, d;
+    __asm__ volatile
+     ("movq %[a], %%rdx; "
+      "addq %%rdx, %%rdx; "
+      "mulx %[b], %[c], %[d]; "
+      "addq %[c], %[lo]; "
+      "adcq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx", "cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rax; "
+      "addq %%rax, %%rax; "
+      "mulq %[b]; "
+      "addq %%rax, %[lo]; " "adcq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rax", "rdx", "cc");
+# endif
+
+    *acc = (((__uint128_t) (hi)) << 64) | lo;
 }

-static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-  uint64_t lo = *acc, hi = *acc>>64;
-  #ifdef __BMI2__
-      uint64_t c,d;
-      __asm__ volatile
-          ("movq %[a], %%rdx; "
-           "mulx %[b], %[c], %[d]; "
-           "subq %[c], %[lo]; "
-           "sbbq %[d], %[hi]; "
-           : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rdx", "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rax; "
-           "mulq %[b]; "
-           "subq %%rax, %[lo]; "
-           "sbbq %%rdx, %[hi]; "
-           : [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rax", "rdx", "cc");
-  #endif
-  *acc = (((__uint128_t)(hi))<<64) | lo;
+static __inline__ void msb(__uint128_t * acc, const uint64_t *a,
+                           const uint64_t *b)
+{
+    uint64_t lo = *acc, hi = *acc >> 64;
+# ifdef __BMI2__
+    uint64_t c, d;
+    __asm__ volatile
+     ("movq %[a], %%rdx; "
+      "mulx %[b], %[c], %[d]; "
+      "subq %[c], %[lo]; "
+      "sbbq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx", "cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rax; "
+      "mulq %[b]; "
+      "subq %%rax, %[lo]; " "sbbq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rax", "rdx", "cc");
+# endif
+    *acc = (((__uint128_t) (hi)) << 64) | lo;
 }

-static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-  uint64_t lo = *acc, hi = *acc>>64;
-  #ifdef __BMI2__
-      uint64_t c,d;
-      __asm__ volatile
-          ("movq %[a], %%rdx; "
-           "addq %%rdx, %%rdx; "
-           "mulx %[b], %[c], %[d]; "
-           "subq %[c], %[lo]; "
-           "sbbq %[d], %[hi]; "
-           : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rdx", "cc");
-  #else
-      __asm__ volatile
-          ("movq %[a], %%rax; "
-           "addq %%rax, %%rax; "
-           "mulq %[b]; "
-           "subq %%rax, %[lo]; "
-           "sbbq %%rdx, %[hi]; "
-           : [lo]"+r"(lo), [hi]"+r"(hi)
-           : [b]"m"(*b), [a]"m"(*a)
-           : "rax", "rdx", "cc");
-  #endif
-  *acc = (((__uint128_t)(hi))<<64) | lo;
-  
+static __inline__ void msb2(__uint128_t * acc, const uint64_t *a,
+                            const uint64_t *b)
+{
+    uint64_t lo = *acc, hi = *acc >> 64;
+# ifdef __BMI2__
+    uint64_t c, d;
+    __asm__ volatile
+     ("movq %[a], %%rdx; "
+      "addq %%rdx, %%rdx; "
+      "mulx %[b], %[c], %[d]; "
+      "subq %[c], %[lo]; "
+      "sbbq %[d], %[hi]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx", "cc");
+# else
+    __asm__ volatile
+     ("movq %[a], %%rax; "
+      "addq %%rax, %%rax; "
+      "mulq %[b]; "
+      "subq %%rax, %[lo]; " "sbbq %%rdx, %[hi]; ":[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rax", "rdx", "cc");
+# endif
+    *acc = (((__uint128_t) (hi)) << 64) | lo;
+
 }

-static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-  uint64_t c,d, lo = *acc, hi = *acc>>64;
-  __asm__ volatile
-      ("movq %[a], %%rdx; "
-       "mulx %[b], %[c], %[d]; "
-       "subq %[lo], %[c]; "
-       "sbbq %[hi], %[d]; "
-       : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-       : [b]"m"(*b), [a]"m"(*a)
-       : "rdx", "cc");
-  *acc = (((__uint128_t)(d))<<64) | c;
+static __inline__ void mrs(__uint128_t * acc, const uint64_t *a,
+                           const uint64_t *b)
+{
+    uint64_t c, d, lo = *acc, hi = *acc >> 64;
+    __asm__ volatile
+     ("movq %[a], %%rdx; "
+      "mulx %[b], %[c], %[d]; "
+      "subq %[lo], %[c]; "
+      "sbbq %[hi], %[d]; ":[c] "=r"(c),[d] "=r"(d),[lo] "+r"(lo),[hi] "+r"(hi)
+      :[b] "m"(*b),[a] "m"(*a)
+      :"rdx", "cc");
+    *acc = (((__uint128_t) (d)) << 64) | c;
 }

-static __inline__ uint64_t word_is_zero(uint64_t x) {
-  __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x));
-  return ~x;
+static __inline__ uint64_t word_is_zero(uint64_t x)
+{
+    __asm__ volatile ("neg %0; sbb %0, %0;":"+r" (x));
+    return ~x;
 }

-static inline uint64_t shrld(__uint128_t x, int n) {
-    return x>>n;
+static inline uint64_t shrld(__uint128_t x, int n)
+{
+    return x >> n;
 }

-#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
+#endif                          /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
--- a/crypto/ec/curve448/arch_x86_64/f_impl.c
+++ b/crypto/ec/curve448/arch_x86_64/f_impl.c
@@ -12,32 +12,34 @@

 #include "f_field.h"

-void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
+void gf_mul(gf_s * __restrict__ cs, const gf as, const gf bs)
+{
    const uint64_t *a = as->limb, *b = bs->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
-    uint64_t mask = (1ull<<56) - 1;  
+    uint64_t mask = (1ull << 56) - 1;

    uint64_t aa[4] VECTOR_ALIGNED, bb[4] VECTOR_ALIGNED, bbb[4] VECTOR_ALIGNED;

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
-    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
-        ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
-        ((uint64xn_t*)bb)[i] = ((const uint64xn_t*)b)[i] + ((const uint64xn_t*)(&b[4]))[i]; 
-        ((uint64xn_t*)bbb)[i] = ((const uint64xn_t*)bb)[i] + ((const uint64xn_t*)(&b[4]))[i];     
+    for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
+        ((uint64xn_t *) aa)[i] =
+            ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
+        ((uint64xn_t *) bb)[i] =
+            ((const uint64xn_t *)b)[i] + ((const uint64xn_t *)(&b[4]))[i];
+        ((uint64xn_t *) bbb)[i] =
+            ((const uint64xn_t *)bb)[i] + ((const uint64xn_t *)(&b[4]))[i];
    }
    /*
-    for (int i=0; i<4; i++) {
-    aa[i] = a[i] + a[i+4];
-    bb[i] = b[i] + b[i+4];
-    }
-    */
+     * for (int i=0; i<4; i++) { aa[i] = a[i] + a[i+4]; bb[i] = b[i] + b[i+4];
+     * }
+     */

-    accum2  = widemul(&a[0],&b[3]);
-    accum0  = widemul(&aa[0],&bb[3]);
-    accum1  = widemul(&a[4],&b[7]);
+    accum2 = widemul(&a[0], &b[3]);
+    accum0 = widemul(&aa[0], &bb[3]);
+    accum1 = widemul(&a[4], &b[7]);

    mac(&accum2, &a[1], &b[2]);
    mac(&accum0, &aa[1], &bb[2]);
@@ -59,18 +61,18 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {

    accum0 >>= 56;
    accum1 >>= 56;
-    
-    mac(&accum0, &aa[1],&bb[3]);
+
+    mac(&accum0, &aa[1], &bb[3]);
    mac(&accum1, &a[5], &b[7]);
    mac(&accum0, &aa[2], &bb[2]);
    mac(&accum1, &a[6], &b[6]);
    mac(&accum0, &aa[3], &bb[1]);
    accum1 += accum0;

-    accum2 = widemul(&a[0],&b[0]);
+    accum2 = widemul(&a[0], &b[0]);
    accum1 -= accum2;
    accum0 += accum2;
-    
+
    msb(&accum0, &a[1], &b[3]);
    msb(&accum0, &a[2], &b[2]);
    mac(&accum1, &a[7], &b[5]);
@@ -84,7 +86,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    accum0 >>= 56;
    accum1 >>= 56;

-    accum2  = widemul(&a[2],&b[7]);
+    accum2 = widemul(&a[2], &b[7]);
    mac(&accum0, &a[6], &bb[3]);
    mac(&accum1, &aa[2], &bbb[3]);

@@ -92,7 +94,7 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    mac(&accum0, &a[7], &bb[2]);
    mac(&accum1, &aa[3], &bbb[2]);

-    mac(&accum2, &a[0],&b[1]);
+    mac(&accum2, &a[0], &b[1]);
    mac(&accum1, &aa[0], &bb[1]);
    mac(&accum0, &a[4], &b[5]);

@@ -109,11 +111,11 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    accum0 >>= 56;
    accum1 >>= 56;

-    accum2  = widemul(&a[3],&b[7]);
+    accum2 = widemul(&a[3], &b[7]);
    mac(&accum0, &a[7], &bb[3]);
    mac(&accum1, &aa[3], &bbb[3]);

-    mac(&accum2, &a[0],&b[2]);
+    mac(&accum2, &a[0], &b[2]);
    mac(&accum1, &aa[0], &bb[2]);
    mac(&accum0, &a[4], &b[6]);

@@ -147,37 +149,46 @@ void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
    c[0] += ((uint64_t)(accum1));
 }

-void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
+void gf_mulw_unsigned(gf_s * __restrict__ cs, const gf as, uint32_t b)
+{
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0, accum4;
-    uint64_t mask = (1ull<<56) - 1;  
+    uint64_t mask = (1ull << 56) - 1;

    accum0 = widemul_rm(b, &a[0]);
    accum4 = widemul_rm(b, &a[4]);

-    c[0] = accum0 & mask; accum0 >>= 56;
-    c[4] = accum4 & mask; accum4 >>= 56;
+    c[0] = accum0 & mask;
+    accum0 >>= 56;
+    c[4] = accum4 & mask;
+    accum4 >>= 56;

    mac_rm(&accum0, b, &a[1]);
    mac_rm(&accum4, b, &a[5]);

-    c[1] = accum0 & mask; accum0 >>= 56;
-    c[5] = accum4 & mask; accum4 >>= 56;
+    c[1] = accum0 & mask;
+    accum0 >>= 56;
+    c[5] = accum4 & mask;
+    accum4 >>= 56;

    mac_rm(&accum0, b, &a[2]);
    mac_rm(&accum4, b, &a[6]);

-    c[2] = accum0 & mask; accum0 >>= 56;
-    c[6] = accum4 & mask; accum4 >>= 56;
+    c[2] = accum0 & mask;
+    accum0 >>= 56;
+    c[6] = accum4 & mask;
+    accum4 >>= 56;

    mac_rm(&accum0, b, &a[3]);
    mac_rm(&accum4, b, &a[7]);

-    c[3] = accum0 & mask; accum0 >>= 56;
-    c[7] = accum4 & mask; accum4 >>= 56;
-    
+    c[3] = accum0 & mask;
+    accum0 >>= 56;
+    c[7] = accum4 & mask;
+    accum4 >>= 56;
+
    accum0 += accum4 + c[4];
    c[4] = accum0 & mask;
    c[5] += accum0 >> 56;
@@ -187,24 +198,26 @@ void gf_mulw_unsigned (gf_s *__restrict__ cs, const gf as, uint32_t b) {
    c[1] += accum4 >> 56;
 }

-void gf_sqr (gf_s *__restrict__ cs, const gf as) {
+void gf_sqr(gf_s * __restrict__ cs, const gf as)
+{
    const uint64_t *a = as->limb;
    uint64_t *c = cs->limb;

    __uint128_t accum0 = 0, accum1 = 0, accum2;
-    uint64_t mask = (1ull<<56) - 1;  
+    uint64_t mask = (1ull << 56) - 1;

    uint64_t aa[4] VECTOR_ALIGNED;

    /* For some reason clang doesn't vectorize this without prompting? */
    unsigned int i;
-    for (i=0; i<sizeof(aa)/sizeof(uint64xn_t); i++) {
-      ((uint64xn_t*)aa)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)(&a[4]))[i];
+    for (i = 0; i < sizeof(aa) / sizeof(uint64xn_t); i++) {
+        ((uint64xn_t *) aa)[i] =
+            ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)(&a[4]))[i];
    }

-    accum2  = widemul(&a[0],&a[3]);
-    accum0  = widemul(&aa[0],&aa[3]);
-    accum1  = widemul(&a[4],&a[7]);
+    accum2 = widemul(&a[0], &a[3]);
+    accum0 = widemul(&aa[0], &aa[3]);
+    accum1 = widemul(&a[4], &a[7]);

    mac(&accum2, &a[1], &a[2]);
    mac(&accum0, &aa[1], &aa[2]);
@@ -213,21 +226,21 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    accum0 -= accum2;
    accum1 += accum2;

-    c[3] = ((uint64_t)(accum1))<<1 & mask;
-    c[7] = ((uint64_t)(accum0))<<1 & mask;
+    c[3] = ((uint64_t)(accum1)) << 1 & mask;
+    c[7] = ((uint64_t)(accum0)) << 1 & mask;

    accum0 >>= 55;
    accum1 >>= 55;

-    mac2(&accum0, &aa[1],&aa[3]);
+    mac2(&accum0, &aa[1], &aa[3]);
    mac2(&accum1, &a[5], &a[7]);
    mac(&accum0, &aa[2], &aa[2]);
    accum1 += accum0;

    msb2(&accum0, &a[1], &a[3]);
    mac(&accum1, &a[6], &a[6]);
-    
-    accum2 = widemul(&a[0],&a[0]);
+
+    accum2 = widemul(&a[0], &a[0]);
    accum1 -= accum2;
    accum0 += accum2;

@@ -241,14 +254,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    accum0 >>= 56;
    accum1 >>= 56;

-    accum2  = widemul2(&aa[2],&aa[3]);
+    accum2 = widemul2(&aa[2], &aa[3]);
    msb2(&accum0, &a[2], &a[3]);
    mac2(&accum1, &a[6], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

-    accum2  = widemul2(&a[0],&a[1]);
+    accum2 = widemul2(&a[0], &a[1]);
    mac2(&accum1, &aa[0], &aa[1]);
    mac2(&accum0, &a[4], &a[5]);

@@ -261,14 +274,14 @@ void gf_sqr (gf_s *__restrict__ cs, const gf as) {
    accum0 >>= 56;
    accum1 >>= 56;

-    accum2  = widemul(&aa[3],&aa[3]);
+    accum2 = widemul(&aa[3], &aa[3]);
    msb(&accum0, &a[3], &a[3]);
    mac(&accum1, &a[7], &a[7]);

    accum1 += accum2;
    accum0 += accum2;

-    accum2  = widemul2(&a[0],&a[2]);
+    accum2 = widemul2(&a[0], &a[2]);
    mac2(&accum1, &aa[0], &aa[2]);
    mac2(&accum0, &a[4], &a[6]);

--- a/crypto/ec/curve448/arch_x86_64/f_impl.h
+++ b/crypto/ec/curve448/arch_x86_64/f_impl.h
@@ -14,60 +14,63 @@
 #define FIELD_LITERAL(a,b,c,d,e,f,g,h) {{a,b,c,d,e,f,g,h}}
 #define LIMB_PLACE_VALUE(i) 56

-void gf_add_RAW (gf out, const gf a, const gf b) {
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
-        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] + ((const uint64xn_t*)b)[i];
+void gf_add_RAW(gf out, const gf a, const gf b)
+{
+    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
+        ((uint64xn_t *) out)[i] =
+            ((const uint64xn_t *)a)[i] + ((const uint64xn_t *)b)[i];
    }
    /*
-    unsigned int i;
-    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
-        out->limb[i] = a->limb[i] + b->limb[i];
-    }
-    */
+     * unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+     * out->limb[i] = a->limb[i] + b->limb[i]; }
+     */
 }

-void gf_sub_RAW (gf out, const gf a, const gf b) {
-    for (unsigned int i=0; i<sizeof(*out)/sizeof(uint64xn_t); i++) {
-        ((uint64xn_t*)out)[i] = ((const uint64xn_t*)a)[i] - ((const uint64xn_t*)b)[i];
+void gf_sub_RAW(gf out, const gf a, const gf b)
+{
+    for (unsigned int i = 0; i < sizeof(*out) / sizeof(uint64xn_t); i++) {
+        ((uint64xn_t *) out)[i] =
+            ((const uint64xn_t *)a)[i] - ((const uint64xn_t *)b)[i];
    }
    /*
-    unsigned int i;
-    for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
-        out->limb[i] = a->limb[i] - b->limb[i];
-    }
-    */
+     * unsigned int i; for (i=0; i<sizeof(*out)/sizeof(out->limb[0]); i++) {
+     * out->limb[i] = a->limb[i] - b->limb[i]; }
+     */
 }

-void gf_bias (gf a, int amt) {
-    uint64_t co1 = ((1ull<<56)-1)*amt, co2 = co1-amt;
-    
+void gf_bias(gf a, int amt)
+{
+    uint64_t co1 = ((1ull << 56) - 1) * amt, co2 = co1 - amt;
+
 #if __AVX2__
-    uint64x4_t lo = {co1,co1,co1,co1}, hi = {co2,co1,co1,co1};
-    uint64x4_t *aa = (uint64x4_t*) a;
+    uint64x4_t lo = { co1, co1, co1, co1 }, hi = {
+    co2, co1, co1, co1};
+    uint64x4_t *aa = (uint64x4_t *) a;
    aa[0] += lo;
    aa[1] += hi;
 #elif __SSE2__
-    uint64x2_t lo = {co1,co1}, hi = {co2,co1};
-    uint64x2_t *aa = (uint64x2_t*) a;
+    uint64x2_t lo = { co1, co1 }, hi = {
+    co2, co1};
+    uint64x2_t *aa = (uint64x2_t *) a;
    aa[0] += lo;
    aa[1] += lo;
    aa[2] += hi;
    aa[3] += lo;
 #else
-    for (unsigned int i=0; i<sizeof(*a)/sizeof(uint64_t); i++) {
-        a->limb[i] += (i==4) ? co2 : co1;
+    for (unsigned int i = 0; i < sizeof(*a) / sizeof(uint64_t); i++) {
+        a->limb[i] += (i == 4) ? co2 : co1;
    }
 #endif
 }

-void gf_weak_reduce (gf a) {
+void gf_weak_reduce(gf a)
+{
    /* PERF: use pshufb/palignr if anyone cares about speed of this */
-    uint64_t mask = (1ull<<56) - 1;
+    uint64_t mask = (1ull << 56) - 1;
    uint64_t tmp = a->limb[7] >> 56;
    a->limb[4] += tmp;
-    for (unsigned int i=7; i>0; i--) {
-        a->limb[i] = (a->limb[i] & mask) + (a->limb[i-1]>>56);
+    for (unsigned int i = 7; i > 0; i--) {
+        a->limb[i] = (a->limb[i] & mask) + (a->limb[i - 1] >> 56);
    }
    a->limb[0] = (a->limb[0] & mask) + tmp;
 }
-
--- a/crypto/ec/curve448/constant_time.h
+++ b/crypto/ec/curve448/constant_time.h
@@ -11,10 +11,10 @@
 */

 #ifndef __CONSTANT_TIME_H__
-#define __CONSTANT_TIME_H__ 1
+# define __CONSTANT_TIME_H__ 1

-#include "word.h"
-#include <string.h>
+# include "word.h"
+# include <string.h>

 /*
 * Constant-time operations on hopefully-compile-time-sized memory
@@ -36,20 +36,19 @@
 * Instead, we're putting our trust in the loop unroller and unswitcher.
 */

-
 /**
 * Unaligned big (vector?) register.
 */
 typedef struct {
    big_register_t unaligned;
-} __attribute__((packed)) unaligned_br_t;
+} __attribute__ ((packed)) unaligned_br_t;

 /**
 * Unaligned word register, for architectures where that matters.
 */
 typedef struct {
    word_t unaligned;
-} __attribute__((packed)) unaligned_word_t;
+} __attribute__ ((packed)) unaligned_word_t;

 /**
 * @brief Constant-time conditional swap.
@@ -60,62 +59,58 @@ typedef struct {
 * as their sizes, if the CPU cares about that sort of thing.
 */
 static __inline__ void
-__attribute__((unused,always_inline))
-constant_time_cond_swap (
-    void *__restrict__ a_,
-    void *__restrict__ b_,
-    word_t elem_bytes,
-    mask_t doswap
-) {
+    __attribute__ ((unused, always_inline))
+    constant_time_cond_swap(void *__restrict__ a_,
+                        void *__restrict__ b_, word_t elem_bytes, mask_t doswap)
+{
    word_t k;
    unsigned char *a = (unsigned char *)a_;
    unsigned char *b = (unsigned char *)b_;
-    
+
    big_register_t br_mask = br_set_to_mask(doswap);
-    for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
+    for (k = 0; k <= elem_bytes - sizeof(big_register_t);
+         k += sizeof(big_register_t)) {
        if (elem_bytes % sizeof(big_register_t)) {
            /* unaligned */
            big_register_t xor =
-                ((unaligned_br_t*)(&a[k]))->unaligned
-              ^ ((unaligned_br_t*)(&b[k]))->unaligned;
+                ((unaligned_br_t *) (&a[k]))->unaligned
+                ^ ((unaligned_br_t *) (&b[k]))->unaligned;
            xor &= br_mask;
-            ((unaligned_br_t*)(&a[k]))->unaligned ^= xor;
-            ((unaligned_br_t*)(&b[k]))->unaligned ^= xor;
+            ((unaligned_br_t *) (&a[k]))->unaligned ^= xor;
+            ((unaligned_br_t *) (&b[k]))->unaligned ^= xor;
        } else {
            /* aligned */
-            big_register_t xor =
-                *((big_register_t*)(&a[k]))
-              ^ *((big_register_t*)(&b[k]));
+            big_register_t xor = *((big_register_t *) (&a[k]))
+                ^ *((big_register_t *) (&b[k]));
            xor &= br_mask;
-            *((big_register_t*)(&a[k])) ^= xor;
-            *((big_register_t*)(&b[k])) ^= xor;
+            *((big_register_t *) (&a[k])) ^= xor;
+            *((big_register_t *) (&b[k])) ^= xor;
        }
    }

    if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
-        for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
+        for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
            if (elem_bytes % sizeof(word_t)) {
                /* unaligned */
                word_t xor =
-                    ((unaligned_word_t*)(&a[k]))->unaligned
-                  ^ ((unaligned_word_t*)(&b[k]))->unaligned;
+                    ((unaligned_word_t *) (&a[k]))->unaligned
+                    ^ ((unaligned_word_t *) (&b[k]))->unaligned;
                xor &= doswap;
-                ((unaligned_word_t*)(&a[k]))->unaligned ^= xor;
-                ((unaligned_word_t*)(&b[k]))->unaligned ^= xor;
+                ((unaligned_word_t *) (&a[k]))->unaligned ^= xor;
+                ((unaligned_word_t *) (&b[k]))->unaligned ^= xor;
            } else {
                /* aligned */
-                word_t xor =
-                    *((word_t*)(&a[k]))
-                  ^ *((word_t*)(&b[k]));
+                word_t xor = *((word_t *) (&a[k]))
+                    ^ *((word_t *) (&b[k]));
                xor &= doswap;
-                *((word_t*)(&a[k])) ^= xor;
-                *((word_t*)(&b[k])) ^= xor;
+                *((word_t *) (&a[k])) ^= xor;
+                *((word_t *) (&b[k])) ^= xor;
            }
        }
    }
-    
+
    if (elem_bytes % sizeof(word_t)) {
-        for (; k<elem_bytes; k+=1) {
+        for (; k < elem_bytes; k += 1) {
            unsigned char xor = a[k] ^ b[k];
            xor &= doswap;
            a[k] ^= xor;
@@ -133,53 +128,60 @@ constant_time_cond_swap (
 * The table and output must not alias.
 */
 static __inline__ void
-__attribute__((unused,always_inline))
-constant_time_lookup (
-    void *__restrict__ out_,
-    const void *table_,
-    word_t elem_bytes,
-    word_t n_table,
-    word_t idx
-) {
+    __attribute__ ((unused, always_inline))
+    constant_time_lookup(void *__restrict__ out_,
+                     const void *table_,
+                     word_t elem_bytes, word_t n_table, word_t idx)
+{
    big_register_t big_one = br_set_to_mask(1), big_i = br_set_to_mask(idx);
-    
+
    /* Can't do pointer arithmetic on void* */
    unsigned char *out = (unsigned char *)out_;
    const unsigned char *table = (const unsigned char *)table_;
-    word_t j,k;
-    
+    word_t j, k;
+
    memset(out, 0, elem_bytes);
-    for (j=0; j<n_table; j++, big_i-=big_one) {        
+    for (j = 0; j < n_table; j++, big_i -= big_one) {
        big_register_t br_mask = br_is_zero(big_i);
        word_t mask;

-        for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
+        for (k = 0; k <= elem_bytes - sizeof(big_register_t);
+             k += sizeof(big_register_t)) {
            if (elem_bytes % sizeof(big_register_t)) {
                /* unaligned */
-                ((unaligned_br_t *)(out+k))->unaligned
-			|= br_mask & ((const unaligned_br_t*)(&table[k+j*elem_bytes]))->unaligned;
+                ((unaligned_br_t *) (out + k))->unaligned
+                    |=
+                    br_mask &
+                    ((const unaligned_br_t
+                      *)(&table[k + j * elem_bytes]))->unaligned;
            } else {
                /* aligned */
-                *(big_register_t *)(out+k) |= br_mask & *(const big_register_t*)(&table[k+j*elem_bytes]);
+                *(big_register_t *) (out + k) |=
+                    br_mask & *(const big_register_t
+                                *)(&table[k + j * elem_bytes]);
            }
        }

-        mask = word_is_zero(idx^j);
+        mask = word_is_zero(idx ^ j);
        if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
-            for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
+            for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
                if (elem_bytes % sizeof(word_t)) {
                    /* input unaligned, output aligned */
-                    *(word_t *)(out+k) |= mask & ((const unaligned_word_t*)(&table[k+j*elem_bytes]))->unaligned;
+                    *(word_t *) (out + k) |=
+                        mask &
+                        ((const unaligned_word_t
+                          *)(&table[k + j * elem_bytes]))->unaligned;
                } else {
                    /* aligned */
-                    *(word_t *)(out+k) |= mask & *(const word_t*)(&table[k+j*elem_bytes]);
+                    *(word_t *) (out + k) |=
+                        mask & *(const word_t *)(&table[k + j * elem_bytes]);
                }
            }
        }
-        
+
        if (elem_bytes % sizeof(word_t)) {
-            for (; k<elem_bytes; k+=1) {
-                out[k] |= mask & table[k+j*elem_bytes];
+            for (; k < elem_bytes; k += 1) {
+                out[k] |= mask & table[k + j * elem_bytes];
            }
        }
    }
@@ -195,58 +197,57 @@ constant_time_lookup (
 * input, it must be equal and not partially overlap.
 */
 static __inline__ void
-__attribute__((unused,always_inline))
-constant_time_select (
-    void *a_,
-    const void *bFalse_,
-    const void *bTrue_,
-    word_t elem_bytes,
-    mask_t mask,
-    size_t alignment_bytes
-) {
+    __attribute__ ((unused, always_inline))
+    constant_time_select(void *a_,
+                     const void *bFalse_,
+                     const void *bTrue_,
+                     word_t elem_bytes, mask_t mask, size_t alignment_bytes)
+{
    unsigned char *a = (unsigned char *)a_;
    const unsigned char *bTrue = (const unsigned char *)bTrue_;
    const unsigned char *bFalse = (const unsigned char *)bFalse_;
    word_t k;
    big_register_t br_mask = br_set_to_mask(mask);
-    
+
    alignment_bytes |= elem_bytes;

-    for (k=0; k<=elem_bytes-sizeof(big_register_t); k+=sizeof(big_register_t)) {
+    for (k = 0; k <= elem_bytes - sizeof(big_register_t);
+         k += sizeof(big_register_t)) {
        if (alignment_bytes % sizeof(big_register_t)) {
            /* unaligned */
-            ((unaligned_br_t*)(&a[k]))->unaligned =
-		  ( br_mask & ((const unaligned_br_t*)(&bTrue [k]))->unaligned)
-		| (~br_mask & ((const unaligned_br_t*)(&bFalse[k]))->unaligned);
+            ((unaligned_br_t *) (&a[k]))->unaligned =
+                (br_mask & ((const unaligned_br_t *)(&bTrue[k]))->unaligned)
+                | (~br_mask &
+                   ((const unaligned_br_t *)(&bFalse[k]))->unaligned);
        } else {
            /* aligned */
-            *(big_register_t *)(a+k) =
-		  ( br_mask & *(const big_register_t*)(&bTrue [k]))
-		| (~br_mask & *(const big_register_t*)(&bFalse[k]));
+            *(big_register_t *) (a + k) =
+                (br_mask & *(const big_register_t *)(&bTrue[k]))
+                | (~br_mask & *(const big_register_t *)(&bFalse[k]));
        }
    }

    if (elem_bytes % sizeof(big_register_t) >= sizeof(word_t)) {
-        for (; k<=elem_bytes-sizeof(word_t); k+=sizeof(word_t)) {
+        for (; k <= elem_bytes - sizeof(word_t); k += sizeof(word_t)) {
            if (alignment_bytes % sizeof(word_t)) {
                /* unaligned */
-                ((unaligned_word_t*)(&a[k]))->unaligned =
-		    ( mask & ((const unaligned_word_t*)(&bTrue [k]))->unaligned)
-		  | (~mask & ((const unaligned_word_t*)(&bFalse[k]))->unaligned);
+                ((unaligned_word_t *) (&a[k]))->unaligned =
+                    (mask & ((const unaligned_word_t *)(&bTrue[k]))->unaligned)
+                    | (~mask &
+                       ((const unaligned_word_t *)(&bFalse[k]))->unaligned);
            } else {
                /* aligned */
-                *(word_t *)(a+k) =
-		    ( mask & *(const word_t*)(&bTrue [k]))
-		  | (~mask & *(const word_t*)(&bFalse[k]));
+                *(word_t *) (a + k) = (mask & *(const word_t *)(&bTrue[k]))
+                    | (~mask & *(const word_t *)(&bFalse[k]));
            }
        }
    }
-    
+
    if (elem_bytes % sizeof(word_t)) {
-        for (; k<elem_bytes; k+=1) {
-            a[k] = ( mask & bTrue[k]) | (~mask & bFalse[k]);
+        for (; k < elem_bytes; k += 1) {
+            a[k] = (mask & bTrue[k]) | (~mask & bFalse[k]);
        }
    }
 }

-#endif /* __CONSTANT_TIME_H__ */
+#endif                          /* __CONSTANT_TIME_H__ */
--- a/crypto/ec/curve448/curve448.c
+++ b/crypto/ec/curve448/curve448.c
--- a/crypto/ec/curve448/curve448_tables.c
+++ b/crypto/ec/curve448/curve448_tables.c
--- a/crypto/ec/curve448/curve448_test.c
+++ b/crypto/ec/curve448/curve448_test.c
@ -72,24 +72,31 @@ const uint8_t in_u3[56] = {

 const uint8_t out_u3[3][56] = {
    {
-        0x3f, 0x48, 0x2c, 0x8a, 0x9f, 0x19, 0xb0, 0x1e, 0x6c, 0x46, 0xee, 0x97,
-        0x11, 0xd9, 0xdc, 0x14, 0xfd, 0x4b, 0xf6, 0x7a, 0xf3, 0x07, 0x65, 0xc2,
-        0xae, 0x2b, 0x84, 0x6a, 0x4d, 0x23, 0xa8, 0xcd, 0x0d, 0xb8, 0x97, 0x08,
-        0x62, 0x39, 0x49, 0x2c, 0xaf, 0x35, 0x0b, 0x51, 0xf8, 0x33, 0x86, 0x8b,
-        0x9b, 0xc2, 0xb3, 0xbc, 0xa9, 0xcf, 0x41, 0x13
-    }, {
-        0xaa, 0x3b, 0x47, 0x49, 0xd5, 0x5b, 0x9d, 0xaf, 0x1e, 0x5b, 0x00, 0x28,
-        0x88, 0x26, 0xc4, 0x67, 0x27, 0x4c, 0xe3, 0xeb, 0xbd, 0xd5, 0xc1, 0x7b,
-        0x97, 0x5e, 0x09, 0xd4, 0xaf, 0x6c, 0x67, 0xcf, 0x10, 0xd0, 0x87, 0x20,
-        0x2d, 0xb8, 0x82, 0x86, 0xe2, 0xb7, 0x9f, 0xce, 0xea, 0x3e, 0xc3, 0x53,
-        0xef, 0x54, 0xfa, 0xa2, 0x6e, 0x21, 0x9f, 0x38
-    }, {
-        0x07, 0x7f, 0x45, 0x36, 0x81, 0xca, 0xca, 0x36, 0x93, 0x19, 0x84, 0x20,
-        0xbb, 0xe5, 0x15, 0xca, 0xe0, 0x00, 0x24, 0x72, 0x51, 0x9b, 0x3e, 0x67,
-        0x66, 0x1a, 0x7e, 0x89, 0xca, 0xb9, 0x46, 0x95, 0xc8, 0xf4, 0xbc, 0xd6,
-        0x6e, 0x61, 0xb9, 0xb9, 0xc9, 0x46, 0xda, 0x8d, 0x52, 0x4d, 0xe3, 0xd6,
-        0x9b, 0xd9, 0xd9, 0xd6, 0x6b, 0x99, 0x7e, 0x37
-    }
+     0x3f, 0x48, 0x2c, 0x8a, 0x9f, 0x19, 0xb0, 0x1e, 0x6c, 0x46, 0xee, 0x97,
+     0x11, 0xd9, 0xdc, 0x14, 0xfd, 0x4b, 0xf6, 0x7a, 0xf3, 0x07, 0x65, 0xc2,
+     0xae, 0x2b, 0x84, 0x6a, 0x4d, 0x23, 0xa8, 0xcd, 0x0d, 0xb8, 0x97, 0x08,
+     0x62, 0x39, 0x49, 0x2c, 0xaf, 0x35, 0x0b, 0x51, 0xf8, 0x33, 0x86, 0x8b,
+     0x9b, 0xc2, 0xb3, 0xbc, 0xa9, 0xcf, 0x41, 0x13}, {
+                                                       0xaa, 0x3b, 0x47, 0x49,
+                                                       0xd5, 0x5b, 0x9d, 0xaf,
+                                                       0x1e, 0x5b, 0x00, 0x28,
+                                                       0x88, 0x26, 0xc4, 0x67,
+                                                       0x27, 0x4c, 0xe3, 0xeb,
+                                                       0xbd, 0xd5, 0xc1, 0x7b,
+                                                       0x97, 0x5e, 0x09, 0xd4,
+                                                       0xaf, 0x6c, 0x67, 0xcf,
+                                                       0x10, 0xd0, 0x87, 0x20,
+                                                       0x2d, 0xb8, 0x82, 0x86,
+                                                       0xe2, 0xb7, 0x9f, 0xce,
+                                                       0xea, 0x3e, 0xc3, 0x53,
+                                                       0xef, 0x54, 0xfa, 0xa2,
+                                                       0x6e, 0x21, 0x9f, 0x38},
+    {
+     0x07, 0x7f, 0x45, 0x36, 0x81, 0xca, 0xca, 0x36, 0x93, 0x19, 0x84, 0x20,
+     0xbb, 0xe5, 0x15, 0xca, 0xe0, 0x00, 0x24, 0x72, 0x51, 0x9b, 0x3e, 0x67,
+     0x66, 0x1a, 0x7e, 0x89, 0xca, 0xb9, 0x46, 0x95, 0xc8, 0xf4, 0xbc, 0xd6,
+     0x6e, 0x61, 0xb9, 0xb9, 0xc9, 0x46, 0xda, 0x8d, 0x52, 0x4d, 0xe3, 0xd6,
+     0x9b, 0xd9, 0xd9, 0xd6, 0x6b, 0x99, 0x7e, 0x37}
 };

 /* Test vectors from RFC8032 for Ed448 */
@ -583,14 +590,13 @@ static const uint8_t *dohash(EVP_MD_CTX *hashctx, const uint8_t *msg,
    static uint8_t hashout[64];

    if (!EVP_DigestInit_ex(hashctx, EVP_shake256(), NULL)
-            || !EVP_DigestUpdate(hashctx, msg, msglen)
-            || !EVP_DigestFinalXOF(hashctx, hashout, sizeof(hashout)))
+        || !EVP_DigestUpdate(hashctx, msg, msglen)
+        || !EVP_DigestFinalXOF(hashctx, hashout, sizeof(hashout)))
        return NULL;

    return hashout;
 }

-
 static int test_eddsa(void)
 {
    uint8_t outsig[114];
@ -614,7 +620,8 @@ static int test_eddsa(void)
        goto err;
    }

-    ED448_sign(outsig, msg3, sizeof(msg3), pubkey3, privkey3, context3, sizeof(context3));
+    ED448_sign(outsig, msg3, sizeof(msg3), pubkey3, privkey3, context3,
+               sizeof(context3));
    if (memcmp(sig3, outsig, sizeof(sig3)) != 0) {
        printf("Calculated sig and expected sig differ (3)\n");
        goto err;
@ -683,7 +690,7 @@ int main(int argc, char *argv[])
    int j = -1;

    if (argc != 1 && (argc != 2 || strcmp(argv[1], "-f") != 0)) {
-        printf ("Usage: curve448_test [-f]\n");
+        printf("Usage: curve448_test [-f]\n");
        return 1;
    }

@ -725,8 +732,9 @@ int main(int argc, char *argv[])
        if (i == 1 || i == 1000 || i == 1000000) {
            j++;
            if (memcmp(out, out_u3[j], sizeof(out)) != 0) {
-                printf("Calculated output and expected output differ (3, %ud)\n",
-                       i);
+                printf
+                    ("Calculated output and expected output differ (3, %ud)\n",
+                     i);
                return 1;
            }
        }
--- a/crypto/ec/curve448/curve448utils.h
+++ b/crypto/ec/curve448/curve448utils.h
@ -11,47 +11,47 @@
 */

 #ifndef __DECAF_COMMON_H__
-#define __DECAF_COMMON_H__ 1
+# define __DECAF_COMMON_H__ 1

-#include <openssl/e_os2.h>
+# include <openssl/e_os2.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

-/* Internal word types.
- *
- * Somewhat tricky.  This could be decided separately per platform.  However,
- * the structs do need to be all the same size and alignment on a given
- * platform to support dynamic linking, since even if you header was built
- * with eg arch_neon, you might end up linking a library built with arch_arm32.
+/*
+ * Internal word types. Somewhat tricky.  This could be decided separately per
+ * platform.  However, the structs do need to be all the same size and
+ * alignment on a given platform to support dynamic linking, since even if your
+ * header was built with e.g. arch_neon, you might end up linking a library
+ * built with arch_arm32.
 */
-#ifndef DECAF_WORD_BITS
-    #if (defined(__ILP64__) || defined(__amd64__) || defined(__x86_64__) || (((__UINT_FAST32_MAX__)>>30)>>30))
-        #define DECAF_WORD_BITS 64 /**< The number of bits in a word */
-    #else
-        #define DECAF_WORD_BITS 32 /**< The number of bits in a word */
-    #endif
-#endif
-    
-#if DECAF_WORD_BITS == 64
+# ifndef DECAF_WORD_BITS
+#  if (defined(__ILP64__) || defined(__amd64__) || defined(__x86_64__) || (((__UINT_FAST32_MAX__)>>30)>>30))
+#   define DECAF_WORD_BITS 64      /**< The number of bits in a word */
+#  else
+#   define DECAF_WORD_BITS 32      /**< The number of bits in a word */
+#  endif
+# endif
+
+# if DECAF_WORD_BITS == 64
 typedef uint64_t decaf_word_t;      /**< Word size for internal computations */
 typedef int64_t decaf_sword_t;      /**< Signed word size for internal computations */
 typedef uint64_t decaf_bool_t;      /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
 typedef __uint128_t decaf_dword_t;  /**< Double-word size for internal computations */
 typedef __int128_t decaf_dsword_t;  /**< Signed double-word size for internal computations */
-#elif DECAF_WORD_BITS == 32         /**< The number of bits in a word */
+# elif DECAF_WORD_BITS == 32        /**< The number of bits in a word */
 typedef uint32_t decaf_word_t;      /**< Word size for internal computations */
 typedef int32_t decaf_sword_t;      /**< Signed word size for internal computations */
 typedef uint32_t decaf_bool_t;      /**< "Boolean" type, will be set to all-zero or all-one (i.e. -1u) */
 typedef uint64_t decaf_dword_t;     /**< Double-word size for internal computations */
 typedef int64_t decaf_dsword_t;     /**< Signed double-word size for internal computations */
-#else
-#error "Only supporting DECAF_WORD_BITS = 32 or 64 for now"
-#endif
-    
+# else
+#  error "Only supporting DECAF_WORD_BITS = 32 or 64 for now"
+# endif
+
 /** DECAF_TRUE = -1 so that DECAF_TRUE & x = x */
-static const decaf_bool_t DECAF_TRUE = -(decaf_bool_t)1;
+static const decaf_bool_t DECAF_TRUE = -(decaf_bool_t) 1;

 /** DECAF_FALSE = 0 so that DECAF_FALSE & x = 0 */
 static const decaf_bool_t DECAF_FALSE = 0;
@ -62,22 +62,21 @@ typedef enum {
    DECAF_FAILURE = 0   /**< The operation failed. */
 } decaf_error_t;

-
 /** Return success if x is true */
-static ossl_inline decaf_error_t
-decaf_succeed_if(decaf_bool_t x) {
-    return (decaf_error_t)x;
+static ossl_inline decaf_error_t decaf_succeed_if(decaf_bool_t x)
+{
+    return (decaf_error_t) x;
 }

-/** Return DECAF_TRUE iff x == DECAF_SUCCESS */
+/** Return DECAF_TRUE iff e == DECAF_SUCCESS */
-static ossl_inline decaf_bool_t
-decaf_successful(decaf_error_t e) {
-    decaf_dword_t w = ((decaf_word_t)e) ^  ((decaf_word_t)DECAF_SUCCESS);
-    return (w-1)>>DECAF_WORD_BITS;
+static ossl_inline decaf_bool_t decaf_successful(decaf_error_t e)
+{
+    decaf_dword_t w = ((decaf_word_t) e) ^ ((decaf_word_t) DECAF_SUCCESS);
+    return (w - 1) >> DECAF_WORD_BITS;
 }
-    
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
-    
-#endif /* __DECAF_COMMON_H__ */
+
+#endif                          /* __DECAF_COMMON_H__ */
--- a/crypto/ec/curve448/ed448.h
+++ b/crypto/ec/curve448/ed448.h
@ -11,31 +11,31 @@
 */

 #ifndef __DECAF_ED448_H__
-#define __DECAF_ED448_H__ 1
+# define __DECAF_ED448_H__ 1

-#include "point_448.h"
+# include "point_448.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /** Number of bytes in an EdDSA public key. */
-#define DECAF_EDDSA_448_PUBLIC_BYTES 57
+# define DECAF_EDDSA_448_PUBLIC_BYTES 57

 /** Number of bytes in an EdDSA private key. */
-#define DECAF_EDDSA_448_PRIVATE_BYTES DECAF_EDDSA_448_PUBLIC_BYTES
+# define DECAF_EDDSA_448_PRIVATE_BYTES DECAF_EDDSA_448_PUBLIC_BYTES

-/** Number of bytes in an EdDSA private key. */
+/** Number of bytes in an EdDSA signature. */
-#define DECAF_EDDSA_448_SIGNATURE_BYTES (DECAF_EDDSA_448_PUBLIC_BYTES + DECAF_EDDSA_448_PRIVATE_BYTES)
+# define DECAF_EDDSA_448_SIGNATURE_BYTES (DECAF_EDDSA_448_PUBLIC_BYTES + DECAF_EDDSA_448_PRIVATE_BYTES)

 /** Does EdDSA support non-contextual signatures? */
-#define DECAF_EDDSA_448_SUPPORTS_CONTEXTLESS_SIGS 0
+# define DECAF_EDDSA_448_SUPPORTS_CONTEXTLESS_SIGS 0

 /** EdDSA encoding ratio. */
-#define DECAF_448_EDDSA_ENCODE_RATIO 4
+# define DECAF_448_EDDSA_ENCODE_RATIO 4

 /** EdDSA decoding ratio. */
-#define DECAF_448_EDDSA_DECODE_RATIO (4 / 4)
+# define DECAF_448_EDDSA_DECODE_RATIO (4 / 4)

 /**
 * @brief EdDSA key generation.  This function uses a different (non-Decaf)
@ -43,11 +43,14 @@ extern "C" {
 *
 * @param [out] pubkey The public key.
 * @param [in] privkey The private key.
- */    
-decaf_error_t decaf_ed448_derive_public_key (
-    uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES]
-);
+ */
+decaf_error_t decaf_ed448_derive_public_key(uint8_t
+                                            pubkey
+                                            [DECAF_EDDSA_448_PUBLIC_BYTES],
+                                            const uint8_t
+                                            privkey
+                                            [DECAF_EDDSA_448_PRIVATE_BYTES]
+    );

 /**
 * @brief EdDSA signing.
@ -65,17 +68,17 @@ decaf_error_t decaf_ed448_derive_public_key (
 * messages, at least without some very careful protocol-level disambiguation.  For Ed448 it is
 * safe.  The C++ wrapper is designed to make it harder to screw this up, but this C code gives
 * you no seat belt.
- */  
-decaf_error_t decaf_ed448_sign (
-    uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t *message,
-    size_t message_len,
-    uint8_t prehashed,
-    const uint8_t *context,
-    size_t context_len
-) __attribute__((nonnull(1,2,3)));
+ */
+decaf_error_t decaf_ed448_sign(uint8_t
+                               signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
+                               const uint8_t
+                               privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
+                               const uint8_t
+                               pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                               const uint8_t *message, size_t message_len,
+                               uint8_t prehashed, const uint8_t *context,
+                               size_t context_len)
+    __attribute__ ((nonnull(1, 2, 3)));

 /**
 * @brief EdDSA signing with prehash.
@ -91,15 +94,18 @@ decaf_error_t decaf_ed448_sign (
 * messages, at least without some very careful protocol-level disambiguation.  For Ed448 it is
 * safe.  The C++ wrapper is designed to make it harder to screw this up, but this C code gives
 * you no seat belt.
- */  
-decaf_error_t decaf_ed448_sign_prehash (
-    uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t hash[64],
-    const uint8_t *context,
-    size_t context_len
-) __attribute__((nonnull(1,2,3,4)));
+ */
+decaf_error_t decaf_ed448_sign_prehash(uint8_t
+                                       signature
+                                       [DECAF_EDDSA_448_SIGNATURE_BYTES],
+                                       const uint8_t
+                                       privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
+                                       const uint8_t
+                                       pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                                       const uint8_t hash[64],
+                                       const uint8_t *context,
+                                       size_t context_len)
+    __attribute__ ((nonnull(1, 2, 3, 4)));

 /**
 * @brief EdDSA signature verification.
@ -119,15 +125,14 @@ decaf_error_t decaf_ed448_sign_prehash (
 * safe.  The C++ wrapper is designed to make it harder to screw this up, but this C code gives
 * you no seat belt.
 */
-decaf_error_t decaf_ed448_verify (
-    const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t *message,
-    size_t message_len,
-    uint8_t prehashed,
-    const uint8_t *context,
-    uint8_t context_len
-) __attribute__((nonnull(1,2)));
+decaf_error_t decaf_ed448_verify(const uint8_t
+                                 signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
+                                 const uint8_t
+                                 pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                                 const uint8_t *message, size_t message_len,
+                                 uint8_t prehashed, const uint8_t *context,
+                                 uint8_t context_len)
+    __attribute__ ((nonnull(1, 2)));

 /**
 * @brief EdDSA signature verification.
@ -145,13 +150,15 @@ decaf_error_t decaf_ed448_verify (
 * safe.  The C++ wrapper is designed to make it harder to screw this up, but this C code gives
 * you no seat belt.
 */
-decaf_error_t decaf_ed448_verify_prehash (
-    const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t hash[64],
-    const uint8_t *context,
-    uint8_t context_len
-) __attribute__((nonnull(1,2)));
+decaf_error_t decaf_ed448_verify_prehash(const uint8_t
+                                         signature
+                                         [DECAF_EDDSA_448_SIGNATURE_BYTES],
+                                         const uint8_t
+                                         pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                                         const uint8_t hash[64],
+                                         const uint8_t *context,
+                                         uint8_t context_len)
+    __attribute__ ((nonnull(1, 2)));

 /**
 * @brief EdDSA point encoding.  Used internally, exposed externally.
@ -176,11 +183,12 @@ decaf_error_t decaf_ed448_verify_prehash (
 *
 * @param [out] enc The encoded point.
 * @param [in] p The point.
- */       
-void curve448_point_mul_by_ratio_and_encode_like_eddsa (
-    uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const curve448_point_t p
-);
+ */
+void curve448_point_mul_by_ratio_and_encode_like_eddsa(uint8_t
+                                                       enc
+                                                       [DECAF_EDDSA_448_PUBLIC_BYTES],
+                                                       const curve448_point_t
+                                                       p);

 /**
 * @brief EdDSA point decoding.  Multiplies by DECAF_448_EDDSA_DECODE_RATIO,
@ -190,11 +198,13 @@ void curve448_point_mul_by_ratio_and_encode_like_eddsa (
 *
- * @param [out] enc The encoded point.
- * @param [in] p The point.
+ * @param [in] enc The encoded point.
+ * @param [out] p The point.
- */       
-decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio (
-    curve448_point_t p,
-    const uint8_t enc[DECAF_EDDSA_448_PUBLIC_BYTES]
-);
+ */
+decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio(curve448_point_t
+                                                                p,
+                                                                const uint8_t
+                                                                enc
+                                                                [DECAF_EDDSA_448_PUBLIC_BYTES]
+    );

 /**
 * @brief EdDSA to ECDH public key conversion
@ -207,10 +217,10 @@ decaf_error_t curve448_point_decode_like_eddsa_and_mul_by_ratio (
 * @param[out] x The ECDH public key as in RFC7748(point on Montgomery curve)
 * @param[in] ed The EdDSA public key(point on Edwards curve)
 */
-void decaf_ed448_convert_public_key_to_x448 (
-    uint8_t x[DECAF_X448_PUBLIC_BYTES],
-    const uint8_t ed[DECAF_EDDSA_448_PUBLIC_BYTES]
-);
+void decaf_ed448_convert_public_key_to_x448(uint8_t x[DECAF_X448_PUBLIC_BYTES],
+                                            const uint8_t
+                                            ed[DECAF_EDDSA_448_PUBLIC_BYTES]
+    );

 /**
 * @brief EdDSA to ECDH private key conversion
@ -220,13 +230,16 @@ void decaf_ed448_convert_public_key_to_x448 (
 * @param[out] x The ECDH private key as in RFC7748
 * @param[in] ed The EdDSA private key
 */
-decaf_error_t decaf_ed448_convert_private_key_to_x448 (
-    uint8_t x[DECAF_X448_PRIVATE_BYTES],
-    const uint8_t ed[DECAF_EDDSA_448_PRIVATE_BYTES]
-);
+decaf_error_t decaf_ed448_convert_private_key_to_x448(uint8_t
+                                                      x
+                                                      [DECAF_X448_PRIVATE_BYTES],
+                                                      const uint8_t
+                                                      ed
+                                                      [DECAF_EDDSA_448_PRIVATE_BYTES]
+    );

 #ifdef __cplusplus
 } /* extern "C" */
 #endif

-#endif /* __DECAF_ED448_H__ */
+#endif                          /* __DECAF_ED448_H__ */
--- a/crypto/ec/curve448/eddsa.c
+++ b/crypto/ec/curve448/eddsa.c
@ -27,12 +27,12 @@

 #if NO_CONTEXT
 const uint8_t NO_CONTEXT_POINTS_HERE = 0;
-const uint8_t * const DECAF_ED448_NO_CONTEXT = &NO_CONTEXT_POINTS_HERE;
+const uint8_t *const DECAF_ED448_NO_CONTEXT = &NO_CONTEXT_POINTS_HERE;
 #endif

-/* EDDSA_BASE_POINT_RATIO = 1 or 2
- * Because EdDSA25519 is not on E_d but on the isogenous E_sigma_d,
- * its base point is twice ours.
+/*
+ * EDDSA_BASE_POINT_RATIO = 1 or 2: because EdDSA25519 is not on E_d but on the
+ * isogenous E_sigma_d, its base point is twice ours.
 */
 #define EDDSA_BASE_POINT_RATIO (1+EDDSA_USE_SIGMA_ISOGENY) /* TODO: remove */

@ -45,8 +45,8 @@ static decaf_error_t oneshot_hash(uint8_t *out, size_t outlen,
        return DECAF_FAILURE;

    if (!EVP_DigestInit_ex(hashctx, EVP_shake256(), NULL)
-            || !EVP_DigestUpdate(hashctx, in, inlen)
-            || !EVP_DigestFinalXOF(hashctx, out, outlen)) {
+        || !EVP_DigestUpdate(hashctx, in, inlen)
+        || !EVP_DigestFinalXOF(hashctx, out, outlen)) {
        EVP_MD_CTX_free(hashctx);
        return DECAF_FAILURE;
    }
@ -55,11 +55,10 @@ static decaf_error_t oneshot_hash(uint8_t *out, size_t outlen,
    return DECAF_SUCCESS;
 }

-
-static void clamp (
-    uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES]
-) {
-    uint8_t hibit = (1<<0)>>1;
+static void clamp(uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES]
+    )
+{
+    uint8_t hibit = (1 << 0) >> 1;

    /* Blarg */
    secret_scalar_ser[0] &= -COFACTOR;
@ -67,18 +66,17 @@ static void clamp (
        secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] = 0;
        secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 2] |= 0x80;
    } else {
-        secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] &= hibit-1;
+        secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] &= hibit - 1;
        secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES - 1] |= hibit;
    }
 }

-static decaf_error_t hash_init_with_dom(
-    EVP_MD_CTX *hashctx,
-    uint8_t prehashed,
-    uint8_t for_prehash,
-    const uint8_t *context,
-    size_t context_len
-) {
+static decaf_error_t hash_init_with_dom(EVP_MD_CTX *hashctx,
+                                        uint8_t prehashed,
+                                        uint8_t for_prehash,
+                                        const uint8_t *context,
+                                        size_t context_len)
+{
    const char *dom_s = "SigEd448";
    uint8_t dom[2];

@ -99,33 +97,38 @@ static decaf_error_t hash_init_with_dom(
 #endif

    if (!EVP_DigestInit_ex(hashctx, EVP_shake256(), NULL)
-            || !EVP_DigestUpdate(hashctx, dom_s, strlen(dom_s))
-            || !EVP_DigestUpdate(hashctx, dom, sizeof(dom))
-            || !EVP_DigestUpdate(hashctx, context, context_len))
+        || !EVP_DigestUpdate(hashctx, dom_s, strlen(dom_s))
+        || !EVP_DigestUpdate(hashctx, dom, sizeof(dom))
+        || !EVP_DigestUpdate(hashctx, context, context_len))
        return DECAF_FAILURE;

    return DECAF_SUCCESS;
 }

 /* In this file because it uses the hash */
-decaf_error_t decaf_ed448_convert_private_key_to_x448 (
-    uint8_t x[DECAF_X448_PRIVATE_BYTES],
-    const uint8_t ed[DECAF_EDDSA_448_PRIVATE_BYTES]
-) {
+decaf_error_t decaf_ed448_convert_private_key_to_x448(uint8_t
+                                                      x
+                                                      [DECAF_X448_PRIVATE_BYTES],
+                                                      const uint8_t
+                                                      ed
+                                                      [DECAF_EDDSA_448_PRIVATE_BYTES]
+    )
+{
    /* pass the private key through oneshot_hash function */
    /* and keep the first DECAF_X448_PRIVATE_BYTES bytes */
-    return oneshot_hash(
-        x,
-        DECAF_X448_PRIVATE_BYTES,
-        ed,
-        DECAF_EDDSA_448_PRIVATE_BYTES
-    );
+    return oneshot_hash(x,
+                        DECAF_X448_PRIVATE_BYTES,
+                        ed, DECAF_EDDSA_448_PRIVATE_BYTES);
 }
-    
-decaf_error_t decaf_ed448_derive_public_key (
-    uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES]
-) {
+
+decaf_error_t decaf_ed448_derive_public_key(uint8_t
+                                            pubkey
+                                            [DECAF_EDDSA_448_PUBLIC_BYTES],
+                                            const uint8_t
+                                            privkey
+                                            [DECAF_EDDSA_448_PRIVATE_BYTES]
+    )
+{
    /* only this much used for keygen */
    uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES];
    curve448_scalar_t secret_scalar;
@ -138,22 +141,25 @@ decaf_error_t decaf_ed448_derive_public_key (
    }
    clamp(secret_scalar_ser);

-    curve448_scalar_decode_long(secret_scalar, secret_scalar_ser, sizeof(secret_scalar_ser));
-    
-    /* Since we are going to mul_by_cofactor during encoding, divide by it here.
-     * However, the EdDSA base point is not the same as the decaf base point if
-     * the sigma isogeny is in use: the EdDSA base point is on Etwist_d/(1-d) and
-     * the decaf base point is on Etwist_d, and when converted it effectively
-     * picks up a factor of 2 from the isogenies.  So we might start at 2 instead of 1. 
+    curve448_scalar_decode_long(secret_scalar, secret_scalar_ser,
+                                sizeof(secret_scalar_ser));
+
+    /*
+     * Since we are going to mul_by_cofactor during encoding, divide by it
+     * here. However, the EdDSA base point is not the same as the decaf base
+     * point if the sigma isogeny is in use: the EdDSA base point is on
+     * Etwist_d/(1-d) and the decaf base point is on Etwist_d, and when
+     * converted it effectively picks up a factor of 2 from the isogenies.  So
+     * we might start at 2 instead of 1.
     */
-    for (c=1; c<DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
-        curve448_scalar_halve(secret_scalar,secret_scalar);
+    for (c = 1; c < DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
+        curve448_scalar_halve(secret_scalar, secret_scalar);
    }
-    
-    curve448_precomputed_scalarmul(p,curve448_precomputed_base,secret_scalar);
-    
+
+    curve448_precomputed_scalarmul(p, curve448_precomputed_base, secret_scalar);
+
    curve448_point_mul_by_ratio_and_encode_like_eddsa(pubkey, p);
-        
+
    /* Cleanup */
    curve448_scalar_destroy(secret_scalar);
    curve448_point_destroy(p);
@ -162,21 +168,21 @@ decaf_error_t decaf_ed448_derive_public_key (
    return DECAF_SUCCESS;
 }

-decaf_error_t decaf_ed448_sign (
-    uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t *message,
-    size_t message_len,
-    uint8_t prehashed,
-    const uint8_t *context,
-    size_t context_len
-) {
+decaf_error_t decaf_ed448_sign(uint8_t
+                               signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
+                               const uint8_t
+                               privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
+                               const uint8_t
+                               pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                               const uint8_t *message, size_t message_len,
+                               uint8_t prehashed, const uint8_t *context,
+                               size_t context_len)
+{
    curve448_scalar_t secret_scalar;
    EVP_MD_CTX *hashctx = EVP_MD_CTX_new();
    decaf_error_t ret = DECAF_FAILURE;
    curve448_scalar_t nonce_scalar;
-    uint8_t nonce_point[DECAF_EDDSA_448_PUBLIC_BYTES] = {0};
+    uint8_t nonce_point[DECAF_EDDSA_448_PUBLIC_BYTES] = { 0 };
    unsigned int c;
    curve448_scalar_t challenge_scalar;

@ -188,28 +194,28 @@ decaf_error_t decaf_ed448_sign (
        struct {
            uint8_t secret_scalar_ser[DECAF_EDDSA_448_PRIVATE_BYTES];
            uint8_t seed[DECAF_EDDSA_448_PRIVATE_BYTES];
-        } __attribute__((packed)) expanded;
+        } __attribute__ ((packed)) expanded;

        if (!oneshot_hash((uint8_t *)&expanded, sizeof(expanded), privkey,
                          DECAF_EDDSA_448_PRIVATE_BYTES))
            goto err;
-        clamp(expanded.secret_scalar_ser);   
-        curve448_scalar_decode_long(secret_scalar, expanded.secret_scalar_ser, sizeof(expanded.secret_scalar_ser));
-    
+        clamp(expanded.secret_scalar_ser);
+        curve448_scalar_decode_long(secret_scalar, expanded.secret_scalar_ser,
+                                    sizeof(expanded.secret_scalar_ser));
+
        /* Hash to create the nonce */
        if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
-                || !EVP_DigestUpdate(hashctx, expanded.seed,
-                                     sizeof(expanded.seed))
-                || !EVP_DigestUpdate(hashctx, message, message_len)) {
+            || !EVP_DigestUpdate(hashctx, expanded.seed, sizeof(expanded.seed))
+            || !EVP_DigestUpdate(hashctx, message, message_len)) {
            OPENSSL_cleanse(&expanded, sizeof(expanded));
            goto err;
        }
        OPENSSL_cleanse(&expanded, sizeof(expanded));
    }
-    
+
    /* Decode the nonce */
    {
-        uint8_t nonce[2*DECAF_EDDSA_448_PRIVATE_BYTES];
+        uint8_t nonce[2 * DECAF_EDDSA_448_PRIVATE_BYTES];

        if (!EVP_DigestFinalXOF(hashctx, nonce, sizeof(nonce)))
            goto err;
@ -222,40 +228,42 @@ decaf_error_t decaf_ed448_sign (
        curve448_scalar_t nonce_scalar_2;
        curve448_point_t p;

-        curve448_scalar_halve(nonce_scalar_2,nonce_scalar);
+        curve448_scalar_halve(nonce_scalar_2, nonce_scalar);
        for (c = 2; c < DECAF_448_EDDSA_ENCODE_RATIO; c <<= 1) {
-            curve448_scalar_halve(nonce_scalar_2,nonce_scalar_2);
+            curve448_scalar_halve(nonce_scalar_2, nonce_scalar_2);
        }

-        curve448_precomputed_scalarmul(p,curve448_precomputed_base,nonce_scalar_2);
+        curve448_precomputed_scalarmul(p, curve448_precomputed_base,
+                                       nonce_scalar_2);
        curve448_point_mul_by_ratio_and_encode_like_eddsa(nonce_point, p);
        curve448_point_destroy(p);
        curve448_scalar_destroy(nonce_scalar_2);
    }

    {
-        uint8_t challenge[2*DECAF_EDDSA_448_PRIVATE_BYTES];
+        uint8_t challenge[2 * DECAF_EDDSA_448_PRIVATE_BYTES];

        /* Compute the challenge */
        if (!hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
-                || !EVP_DigestUpdate(hashctx, nonce_point, sizeof(nonce_point))
-                || !EVP_DigestUpdate(hashctx, pubkey,
-                                     DECAF_EDDSA_448_PUBLIC_BYTES)
-                || !EVP_DigestUpdate(hashctx, message, message_len)
-                || !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge)))
+            || !EVP_DigestUpdate(hashctx, nonce_point, sizeof(nonce_point))
+            || !EVP_DigestUpdate(hashctx, pubkey, DECAF_EDDSA_448_PUBLIC_BYTES)
+            || !EVP_DigestUpdate(hashctx, message, message_len)
+            || !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge)))
            goto err;

-        curve448_scalar_decode_long(challenge_scalar,challenge,sizeof(challenge));
-        OPENSSL_cleanse(challenge,sizeof(challenge));
+        curve448_scalar_decode_long(challenge_scalar, challenge,
+                                    sizeof(challenge));
+        OPENSSL_cleanse(challenge, sizeof(challenge));
    }
-    
-    curve448_scalar_mul(challenge_scalar,challenge_scalar,secret_scalar);
-    curve448_scalar_add(challenge_scalar,challenge_scalar,nonce_scalar);
-    
-    OPENSSL_cleanse(signature,DECAF_EDDSA_448_SIGNATURE_BYTES);
-    memcpy(signature,nonce_point,sizeof(nonce_point));
-    curve448_scalar_encode(&signature[DECAF_EDDSA_448_PUBLIC_BYTES],challenge_scalar);
-    
+
+    curve448_scalar_mul(challenge_scalar, challenge_scalar, secret_scalar);
+    curve448_scalar_add(challenge_scalar, challenge_scalar, nonce_scalar);
+
+    OPENSSL_cleanse(signature, DECAF_EDDSA_448_SIGNATURE_BYTES);
+    memcpy(signature, nonce_point, sizeof(nonce_point));
+    curve448_scalar_encode(&signature[DECAF_EDDSA_448_PUBLIC_BYTES],
+                           challenge_scalar);
+
    curve448_scalar_destroy(secret_scalar);
    curve448_scalar_destroy(nonce_scalar);
    curve448_scalar_destroy(challenge_scalar);
@ -266,97 +274,103 @@ decaf_error_t decaf_ed448_sign (
    return ret;
 }

-
-decaf_error_t decaf_ed448_sign_prehash (
-    uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t hash[64],
-    const uint8_t *context,
-    size_t context_len
-) {
-    return decaf_ed448_sign(signature,privkey,pubkey,hash,64,1,context,
+decaf_error_t decaf_ed448_sign_prehash(uint8_t
+                                       signature
+                                       [DECAF_EDDSA_448_SIGNATURE_BYTES],
+                                       const uint8_t
+                                       privkey[DECAF_EDDSA_448_PRIVATE_BYTES],
+                                       const uint8_t
+                                       pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                                       const uint8_t hash[64],
+                                       const uint8_t *context,
+                                       size_t context_len)
+{
+    return decaf_ed448_sign(signature, privkey, pubkey, hash, 64, 1, context,
                            context_len);
-    /*OPENSSL_cleanse(hash,sizeof(hash));*/
+    /*
+     * OPENSSL_cleanse(hash,sizeof(hash));
+     */
 }

-decaf_error_t decaf_ed448_verify (
-    const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t *message,
-    size_t message_len,
-    uint8_t prehashed,
-    const uint8_t *context,
-    uint8_t context_len
-) { 
+decaf_error_t decaf_ed448_verify(const uint8_t
+                                 signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
+                                 const uint8_t
+                                 pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                                 const uint8_t *message, size_t message_len,
+                                 uint8_t prehashed, const uint8_t *context,
+                                 uint8_t context_len)
+{
    curve448_point_t pk_point, r_point;
-    decaf_error_t error = curve448_point_decode_like_eddsa_and_mul_by_ratio(pk_point,pubkey);
+    decaf_error_t error =
+        curve448_point_decode_like_eddsa_and_mul_by_ratio(pk_point, pubkey);
    curve448_scalar_t challenge_scalar;
    curve448_scalar_t response_scalar;
    unsigned int c;

-    if (DECAF_SUCCESS != error) { return error; }
-    
-    error = curve448_point_decode_like_eddsa_and_mul_by_ratio(r_point,signature);
-    if (DECAF_SUCCESS != error) { return error; }
-    
+    if (DECAF_SUCCESS != error) {
+        return error;
+    }
+
+    error =
+        curve448_point_decode_like_eddsa_and_mul_by_ratio(r_point, signature);
+    if (DECAF_SUCCESS != error) {
+        return error;
+    }
+
    {
        /* Compute the challenge */
        EVP_MD_CTX *hashctx = EVP_MD_CTX_new();
-        uint8_t challenge[2*DECAF_EDDSA_448_PRIVATE_BYTES];
+        uint8_t challenge[2 * DECAF_EDDSA_448_PRIVATE_BYTES];

        if (hashctx == NULL
-                || !hash_init_with_dom(hashctx, prehashed, 0, context,
-                                       context_len)
-                || !EVP_DigestUpdate(hashctx, signature,
-                                     DECAF_EDDSA_448_PUBLIC_BYTES)
-                || !EVP_DigestUpdate(hashctx, pubkey,
-                                     DECAF_EDDSA_448_PUBLIC_BYTES)
-                || !EVP_DigestUpdate(hashctx, message, message_len)
-                || !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge))) {
+            || !hash_init_with_dom(hashctx, prehashed, 0, context, context_len)
+            || !EVP_DigestUpdate(hashctx, signature,
+                                 DECAF_EDDSA_448_PUBLIC_BYTES)
+            || !EVP_DigestUpdate(hashctx, pubkey, DECAF_EDDSA_448_PUBLIC_BYTES)
+            || !EVP_DigestUpdate(hashctx, message, message_len)
+            || !EVP_DigestFinalXOF(hashctx, challenge, sizeof(challenge))) {
            EVP_MD_CTX_free(hashctx);
            return DECAF_FAILURE;
        }

        EVP_MD_CTX_free(hashctx);
-        curve448_scalar_decode_long(challenge_scalar,challenge,sizeof(challenge));
-        OPENSSL_cleanse(challenge,sizeof(challenge));
+        curve448_scalar_decode_long(challenge_scalar, challenge,
+                                    sizeof(challenge));
+        OPENSSL_cleanse(challenge, sizeof(challenge));
    }
-    curve448_scalar_sub(challenge_scalar, curve448_scalar_zero, challenge_scalar);
+    curve448_scalar_sub(challenge_scalar, curve448_scalar_zero,
+                        challenge_scalar);

-    curve448_scalar_decode_long(
-        response_scalar,
-        &signature[DECAF_EDDSA_448_PUBLIC_BYTES],
-        DECAF_EDDSA_448_PRIVATE_BYTES
-    );
-    
-    for (c=1; c<DECAF_448_EDDSA_DECODE_RATIO; c<<=1) {
-        curve448_scalar_add(response_scalar,response_scalar,response_scalar);
+    curve448_scalar_decode_long(response_scalar,
+                                &signature[DECAF_EDDSA_448_PUBLIC_BYTES],
+                                DECAF_EDDSA_448_PRIVATE_BYTES);
+
+    for (c = 1; c < DECAF_448_EDDSA_DECODE_RATIO; c <<= 1) {
+        curve448_scalar_add(response_scalar, response_scalar, response_scalar);
    }
-    
-    
+
    /* pk_point = -c(x(P)) + (cx + k)G = kG */
-    curve448_base_double_scalarmul_non_secret(
-        pk_point,
-        response_scalar,
-        pk_point,
-        challenge_scalar
-    );
-    return decaf_succeed_if(curve448_point_eq(pk_point,r_point));
+    curve448_base_double_scalarmul_non_secret(pk_point,
+                                              response_scalar,
+                                              pk_point, challenge_scalar);
+    return decaf_succeed_if(curve448_point_eq(pk_point, r_point));
 }

-
-decaf_error_t decaf_ed448_verify_prehash (
-    const uint8_t signature[DECAF_EDDSA_448_SIGNATURE_BYTES],
-    const uint8_t pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
-    const uint8_t hash[64],
-    const uint8_t *context,
-    uint8_t context_len
-) {
+decaf_error_t decaf_ed448_verify_prehash(const uint8_t
+                                         signature
+                                         [DECAF_EDDSA_448_SIGNATURE_BYTES],
+                                         const uint8_t
+                                         pubkey[DECAF_EDDSA_448_PUBLIC_BYTES],
+                                         const uint8_t hash[64],
+                                         const uint8_t *context,
+                                         uint8_t context_len)
+{
    decaf_error_t ret;
-    
-    ret = decaf_ed448_verify(signature,pubkey,hash,64,1,context,context_len);
-    
+
+    ret =
+        decaf_ed448_verify(signature, pubkey, hash, 64, 1, context,
+                           context_len);
+
    return ret;
 }

@ -367,10 +381,9 @@ int ED448_sign(uint8_t *out_sig, const uint8_t *message, size_t message_len,

    return decaf_ed448_sign(out_sig, private_key, public_key, message,
                            message_len, 0, context, context_len)
-                            == DECAF_SUCCESS;
+        == DECAF_SUCCESS;
 }

-
 int ED448_verify(const uint8_t *message, size_t message_len,
                 const uint8_t signature[114], const uint8_t public_key[57],
                 const uint8_t *context, size_t context_len)
@ -397,8 +410,8 @@ int ED448ph_verify(const uint8_t hash[64], const uint8_t signature[114],
 }

 int ED448_public_from_private(uint8_t out_public_key[57],
-                               const uint8_t private_key[57])
+                              const uint8_t private_key[57])
 {
    return decaf_ed448_derive_public_key(out_public_key, private_key)
-           == DECAF_SUCCESS;
+        == DECAF_SUCCESS;
 }
--- a/crypto/ec/curve448/f_arithmetic.c
+++ b/crypto/ec/curve448/f_arithmetic.c
@ -12,37 +12,35 @@

 #include "field.h"

-mask_t gf_isr (
-    gf a,
-    const gf x
-) {
+mask_t gf_isr(gf a, const gf x)
+{
    gf L0, L1, L2;
-    gf_sqr  (L1,     x );
-    gf_mul  (L2,     x,   L1 );
-    gf_sqr  (L1,   L2 );
-    gf_mul  (L2,     x,   L1 );
-    gf_sqrn (L1,   L2,     3 );
-    gf_mul  (L0,   L2,   L1 );
-    gf_sqrn (L1,   L0,     3 );
-    gf_mul  (L0,   L2,   L1 );
-    gf_sqrn (L2,   L0,     9 );
-    gf_mul  (L1,   L0,   L2 );
-    gf_sqr  (L0,   L1 );
-    gf_mul  (L2,     x,   L0 );
-    gf_sqrn (L0,   L2,    18 );
-    gf_mul  (L2,   L1,   L0 );
-    gf_sqrn (L0,   L2,    37 );
-    gf_mul  (L1,   L2,   L0 );
-    gf_sqrn (L0,   L1,    37 );
-    gf_mul  (L1,   L2,   L0 );
-    gf_sqrn (L0,   L1,   111 );
-    gf_mul  (L2,   L1,   L0 );
-    gf_sqr  (L0,   L2 );
-    gf_mul  (L1,     x,   L0 );
-    gf_sqrn (L0,   L1,   223 );
-    gf_mul  (L1,   L2,   L0 );
-    gf_sqr  (L2, L1);
-    gf_mul  (L0, L2, x);
-    gf_copy(a,L1);
-    return gf_eq(L0,ONE);
+    gf_sqr(L1, x);
+    gf_mul(L2, x, L1);
+    gf_sqr(L1, L2);
+    gf_mul(L2, x, L1);
+    gf_sqrn(L1, L2, 3);
+    gf_mul(L0, L2, L1);
+    gf_sqrn(L1, L0, 3);
+    gf_mul(L0, L2, L1);
+    gf_sqrn(L2, L0, 9);
+    gf_mul(L1, L0, L2);
+    gf_sqr(L0, L1);
+    gf_mul(L2, x, L0);
+    gf_sqrn(L0, L2, 18);
+    gf_mul(L2, L1, L0);
+    gf_sqrn(L0, L2, 37);
+    gf_mul(L1, L2, L0);
+    gf_sqrn(L0, L1, 37);
+    gf_mul(L1, L2, L0);
+    gf_sqrn(L0, L1, 111);
+    gf_mul(L2, L1, L0);
+    gf_sqr(L0, L2);
+    gf_mul(L1, x, L0);
+    gf_sqrn(L0, L1, 223);
+    gf_mul(L1, L2, L0);
+    gf_sqr(L2, L1);
+    gf_mul(L0, L2, x);
+    gf_copy(a, L1);
+    return gf_eq(L0, ONE);
 }
--- a/crypto/ec/curve448/f_field.h
+++ b/crypto/ec/curve448/f_field.h
@ -11,91 +11,97 @@
 */

 #ifndef __P448_F_FIELD_H__
-#define __P448_F_FIELD_H__ 1
+# define __P448_F_FIELD_H__ 1

-#include "constant_time.h"
-#include <string.h>
-#include <assert.h>
+# include "constant_time.h"
+# include <string.h>
+# include <assert.h>

-#include "word.h"
+# include "word.h"

-#define __DECAF_448_GF_DEFINED__ 1
-#define NLIMBS (64/sizeof(word_t))
-#define X_SER_BYTES 56
-#define SER_BYTES 56
+# define __DECAF_448_GF_DEFINED__ 1
+# define NLIMBS (64/sizeof(word_t))
+# define X_SER_BYTES 56
+# define SER_BYTES 56
 typedef struct gf_448_s {
    word_t limb[NLIMBS];
-} __attribute__((aligned(32))) gf_448_s, gf_448_t[1];
+} __attribute__ ((aligned(32))) gf_448_s, gf_448_t[1];

-#define GF_LIT_LIMB_BITS  56
-#define GF_BITS           448
-#define ZERO              gf_448_ZERO
-#define ONE               gf_448_ONE
-#define MODULUS           gf_448_MODULUS
-#define gf                gf_448_t
-#define gf_s              gf_448_s
-#define gf_eq             gf_448_eq
-#define gf_hibit          gf_448_hibit
-#define gf_lobit          gf_448_lobit
-#define gf_copy           gf_448_copy
-#define gf_add            gf_448_add
-#define gf_sub            gf_448_sub
-#define gf_add_RAW        gf_448_add_RAW
-#define gf_sub_RAW        gf_448_sub_RAW
-#define gf_bias           gf_448_bias
-#define gf_weak_reduce    gf_448_weak_reduce
-#define gf_strong_reduce  gf_448_strong_reduce
-#define gf_mul            gf_448_mul
-#define gf_sqr            gf_448_sqr
-#define gf_mulw_unsigned  gf_448_mulw_unsigned
-#define gf_isr            gf_448_isr
-#define gf_serialize      gf_448_serialize
-#define gf_deserialize    gf_448_deserialize
+# define GF_LIT_LIMB_BITS  56
+# define GF_BITS           448
+# define ZERO              gf_448_ZERO
+# define ONE               gf_448_ONE
+# define MODULUS           gf_448_MODULUS
+# define gf                gf_448_t
+# define gf_s              gf_448_s
+# define gf_eq             gf_448_eq
+# define gf_hibit          gf_448_hibit
+# define gf_lobit          gf_448_lobit
+# define gf_copy           gf_448_copy
+# define gf_add            gf_448_add
+# define gf_sub            gf_448_sub
+# define gf_add_RAW        gf_448_add_RAW
+# define gf_sub_RAW        gf_448_sub_RAW
+# define gf_bias           gf_448_bias
+# define gf_weak_reduce    gf_448_weak_reduce
+# define gf_strong_reduce  gf_448_strong_reduce
+# define gf_mul            gf_448_mul
+# define gf_sqr            gf_448_sqr
+# define gf_mulw_unsigned  gf_448_mulw_unsigned
+# define gf_isr            gf_448_isr
+# define gf_serialize      gf_448_serialize
+# define gf_deserialize    gf_448_deserialize

 /* RFC 7748 support */
-#define X_PUBLIC_BYTES  X_SER_BYTES
-#define X_PRIVATE_BYTES X_PUBLIC_BYTES
-#define X_PRIVATE_BITS  448
+# define X_PUBLIC_BYTES  X_SER_BYTES
+# define X_PRIVATE_BYTES X_PUBLIC_BYTES
+# define X_PRIVATE_BITS  448

-#define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))
+# define INLINE_UNUSED __inline__ __attribute__((unused,always_inline))

 #ifdef __cplusplus
 extern "C" {
 #endif

 /* Defined below in f_impl.h */
-static INLINE_UNUSED void gf_copy (gf out, const gf a) { *out = *a; }
-static INLINE_UNUSED void gf_add_RAW (gf out, const gf a, const gf b);
-static INLINE_UNUSED void gf_sub_RAW (gf out, const gf a, const gf b);
-static INLINE_UNUSED void gf_bias (gf inout, int amount);
-static INLINE_UNUSED void gf_weak_reduce (gf inout);
+static INLINE_UNUSED void gf_copy(gf out, const gf a)
+{
+    *out = *a;
+}

-void gf_strong_reduce (gf inout);   
-void gf_add (gf out, const gf a, const gf b);
-void gf_sub (gf out, const gf a, const gf b);
-void gf_mul (gf_s *__restrict__ out, const gf a, const gf b);
-void gf_mulw_unsigned (gf_s *__restrict__ out, const gf a, uint32_t b);
-void gf_sqr (gf_s *__restrict__ out, const gf a);
+static INLINE_UNUSED void gf_add_RAW(gf out, const gf a, const gf b);
+static INLINE_UNUSED void gf_sub_RAW(gf out, const gf a, const gf b);
+static INLINE_UNUSED void gf_bias(gf inout, int amount);
+static INLINE_UNUSED void gf_weak_reduce(gf inout);
+
+void gf_strong_reduce(gf inout);
+void gf_add(gf out, const gf a, const gf b);
+void gf_sub(gf out, const gf a, const gf b);
+void gf_mul(gf_s * __restrict__ out, const gf a, const gf b);
+void gf_mulw_unsigned(gf_s * __restrict__ out, const gf a, uint32_t b);
+void gf_sqr(gf_s * __restrict__ out, const gf a);
 mask_t gf_isr(gf a, const gf x); /** a^2 x = 1, QNR, or 0 if x=0.  Return true if successful */
-mask_t gf_eq (const gf x, const gf y);
-mask_t gf_lobit (const gf x);
-mask_t gf_hibit (const gf x);
+mask_t gf_eq(const gf x, const gf y);
+mask_t gf_lobit(const gf x);
+mask_t gf_hibit(const gf x);

-void gf_serialize (uint8_t *serial, const gf x,int with_highbit);
-mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES],int with_hibit,uint8_t hi_nmask);
+void gf_serialize(uint8_t *serial, const gf x, int with_highbit);
+mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
+                      uint8_t hi_nmask);


 #ifdef __cplusplus
 } /* extern "C" */
 #endif

-#include "f_impl.h" /* Bring in the inline implementations */
+# include "f_impl.h"            /* Bring in the inline implementations */

-#ifndef LIMBPERM
-  #define LIMBPERM(i) (i)
-#endif
-#define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)
+# ifndef LIMBPERM
+#  define LIMBPERM(i) (i)
+# endif
+# define LIMB_MASK(i) (((1)<<LIMB_PLACE_VALUE(i))-1)

-static const gf ZERO = {{{0}}}, ONE = {{{1}}};
+static const gf ZERO = { {{0}} }, ONE = { { {
+1}}};

-#endif /* __P448_F_FIELD_H__ */
+#endif                          /* __P448_F_FIELD_H__ */
--- a/crypto/ec/curve448/f_generic.c
+++ b/crypto/ec/curve448/f_generic.c
@ -11,24 +11,29 @@
 */
 #include "field.h"

-static const gf MODULUS = {FIELD_LITERAL(
-    0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff, 0xfffffffffffffe, 0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff
-)};
+static const gf MODULUS =
+    { FIELD_LITERAL(0xffffffffffffff, 0xffffffffffffff, 0xffffffffffffff,
+                    0xffffffffffffff, 0xfffffffffffffe, 0xffffffffffffff,
+                    0xffffffffffffff, 0xffffffffffffff)
+};

 /** Serialize to wire format. */
-void gf_serialize (uint8_t serial[SER_BYTES], const gf x, int with_hibit) {
-    unsigned int j=0, fill=0;
+void gf_serialize(uint8_t serial[SER_BYTES], const gf x, int with_hibit)
+{
+    unsigned int j = 0, fill = 0;
    dword_t buffer = 0;
    unsigned int i;
    gf red;

    gf_copy(red, x);
    gf_strong_reduce(red);
-    if (!with_hibit) { assert(gf_hibit(red) == 0); }
+    if (!with_hibit) {
+        assert(gf_hibit(red) == 0);
+    }

-    UNROLL for (i=0; i<(with_hibit ? X_SER_BYTES : SER_BYTES); i++) {
+    UNROLL for (i = 0; i < (with_hibit ? X_SER_BYTES : SER_BYTES); i++) {
        if (fill < 8 && j < NLIMBS) {
-            buffer |= ((dword_t)red->limb[LIMBPERM(j)]) << fill;
+            buffer |= ((dword_t) red->limb[LIMBPERM(j)]) << fill;
            fill += LIMB_PLACE_VALUE(LIMBPERM(j));
            j++;
        }
@ -39,78 +44,90 @@ void gf_serialize (uint8_t serial[SER_BYTES], const gf x, int with_hibit) {
 }

 /** Return high bit of x = low bit of 2x mod p */
-mask_t gf_hibit(const gf x) {
+mask_t gf_hibit(const gf x)
+{
    gf y;
-    gf_add(y,x,x);
+    gf_add(y, x, x);
    gf_strong_reduce(y);
-    return -(y->limb[0]&1);
+    return -(y->limb[0] & 1);
 }

 /** Return high bit of x = low bit of 2x mod p */
-mask_t gf_lobit(const gf x) {
+mask_t gf_lobit(const gf x)
+{
    gf y;
-    gf_copy(y,x);
+    gf_copy(y, x);
    gf_strong_reduce(y);
-    return -(y->limb[0]&1);
+    return -(y->limb[0] & 1);
 }

 /** Deserialize from wire format; return -1 on success and 0 on failure. */
-mask_t gf_deserialize (gf x, const uint8_t serial[SER_BYTES], int with_hibit, uint8_t hi_nmask) {
-    unsigned int j=0, fill=0;
+mask_t gf_deserialize(gf x, const uint8_t serial[SER_BYTES], int with_hibit,
+                      uint8_t hi_nmask)
+{
+    unsigned int j = 0, fill = 0;
    dword_t buffer = 0;
    dsword_t scarry = 0;
    const unsigned nbytes = with_hibit ? X_SER_BYTES : SER_BYTES;
    unsigned int i;
    mask_t succ;

-    UNROLL for (i=0; i<NLIMBS; i++) {
+    UNROLL for (i = 0; i < NLIMBS; i++) {
        UNROLL while (fill < LIMB_PLACE_VALUE(LIMBPERM(i)) && j < nbytes) {
            uint8_t sj = serial[j];
-            if (j==nbytes-1) sj &= ~hi_nmask;
-            buffer |= ((dword_t)sj) << fill;
+            if (j == nbytes - 1)
+                sj &= ~hi_nmask;
+            buffer |= ((dword_t) sj) << fill;
            fill += 8;
            j++;
        }
-        x->limb[LIMBPERM(i)] = (i<NLIMBS-1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer;
+        x->limb[LIMBPERM(i)] =
+            (i < NLIMBS - 1) ? buffer & LIMB_MASK(LIMBPERM(i)) : buffer;
        fill -= LIMB_PLACE_VALUE(LIMBPERM(i));
        buffer >>= LIMB_PLACE_VALUE(LIMBPERM(i));
-        scarry = (scarry + x->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)]) >> (8*sizeof(word_t));
+        scarry =
+            (scarry + x->limb[LIMBPERM(i)] -
+             MODULUS->limb[LIMBPERM(i)]) >> (8 * sizeof(word_t));
    }
-    succ = with_hibit ? -(mask_t)1 : ~gf_hibit(x);
+    succ = with_hibit ? -(mask_t) 1 : ~gf_hibit(x);
    return succ & word_is_zero(buffer) & ~word_is_zero(scarry);
 }

 /** Reduce to canonical form. */
-void gf_strong_reduce (gf a) {
+void gf_strong_reduce(gf a)
+{
    dsword_t scarry;
    word_t scarry_0;
    dword_t carry = 0;
    unsigned int i;

    /* first, clear high */
-    gf_weak_reduce(a); /* Determined to have negligible perf impact. */
+    gf_weak_reduce(a);          /* Determined to have negligible perf impact. */

    /* now the total is less than 2p */

    /* compute total_value - p.  No need to reduce mod p. */
    scarry = 0;
-    for (i=0; i<NLIMBS; i++) {
+    for (i = 0; i < NLIMBS; i++) {
        scarry = scarry + a->limb[LIMBPERM(i)] - MODULUS->limb[LIMBPERM(i)];
        a->limb[LIMBPERM(i)] = scarry & LIMB_MASK(LIMBPERM(i));
        scarry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
    }

-    /* uncommon case: it was >= p, so now scarry = 0 and this = x
-     * common case: it was < p, so now scarry = -1 and this = x - p + 2^255
-     * so let's add back in p.  will carry back off the top for 2^255.
+    /*
+     * uncommon case: it was >= p, so now scarry = 0 and this = x
+     * common case: it was < p, so now scarry = -1 and this = x - p + 2^448
+     * so let's add back in p.  will carry back off the top for 2^448.
     */
-    assert(word_is_zero(scarry) | word_is_zero(scarry+1));
+    assert(word_is_zero(scarry) | word_is_zero(scarry + 1));

    scarry_0 = scarry;

    /* add it back */
-    for (i=0; i<NLIMBS; i++) {
-        carry = carry + a->limb[LIMBPERM(i)] + (scarry_0 & MODULUS->limb[LIMBPERM(i)]);
+    for (i = 0; i < NLIMBS; i++) {
+        carry =
+            carry + a->limb[LIMBPERM(i)] +
+            (scarry_0 & MODULUS->limb[LIMBPERM(i)]);
        a->limb[LIMBPERM(i)] = carry & LIMB_MASK(LIMBPERM(i));
        carry >>= LIMB_PLACE_VALUE(LIMBPERM(i));
    }
@ -119,28 +136,31 @@ void gf_strong_reduce (gf a) {
 }

 /** Subtract two gf elements d=a-b */
-void gf_sub (gf d, const gf a, const gf b) {
-    gf_sub_RAW ( d, a, b );
-    gf_bias( d, 2 );
-    gf_weak_reduce ( d );
+void gf_sub(gf d, const gf a, const gf b)
+{
+    gf_sub_RAW(d, a, b);
+    gf_bias(d, 2);
+    gf_weak_reduce(d);
 }

 /** Add two field elements d = a+b */
-void gf_add (gf d, const gf a, const gf b) {
-    gf_add_RAW ( d, a, b );
-    gf_weak_reduce ( d );
+void gf_add(gf d, const gf a, const gf b)
+{
+    gf_add_RAW(d, a, b);
+    gf_weak_reduce(d);
 }

 /** Compare a==b */
-mask_t gf_eq(const gf a, const gf b) {
+mask_t gf_eq(const gf a, const gf b)
+{
    gf c;
-    mask_t ret=0;
+    mask_t ret = 0;
    unsigned int i;

-    gf_sub(c,a,b);
+    gf_sub(c, a, b);
    gf_strong_reduce(c);

-    for (i=0; i<NLIMBS; i++) {
+    for (i = 0; i < NLIMBS; i++) {
        ret |= c->limb[LIMBPERM(i)];
    }

--- a/crypto/ec/curve448/field.h
+++ b/crypto/ec/curve448/field.h
@ -11,85 +11,90 @@
 */

 #ifndef __GF_H__
-#define __GF_H__
+# define __GF_H__
+
+# include "constant_time.h"
+# include "f_field.h"
+# include <string.h>

-#include "constant_time.h"
-#include "f_field.h"
-#include <string.h>
-    
 /** Square x, n times. */
-static ossl_inline void gf_sqrn (
-    gf_s *__restrict__ y,
-    const gf x,
-    int n
-) {
+static ossl_inline void gf_sqrn(gf_s * __restrict__ y, const gf x, int n)
+{
    gf tmp;
-    assert(n>0);
-    if (n&1) {
-        gf_sqr(y,x);
+    assert(n > 0);
+    if (n & 1) {
+        gf_sqr(y, x);
        n--;
    } else {
-        gf_sqr(tmp,x);
-        gf_sqr(y,tmp);
-        n-=2;
+        gf_sqr(tmp, x);
+        gf_sqr(y, tmp);
+        n -= 2;
    }
-    for (; n; n-=2) {
-        gf_sqr(tmp,y);
-        gf_sqr(y,tmp);
+    for (; n; n -= 2) {
+        gf_sqr(tmp, y);
+        gf_sqr(y, tmp);
    }
 }

-#define gf_add_nr gf_add_RAW
+# define gf_add_nr gf_add_RAW

 /** Subtract mod p.  Bias by 2 and don't reduce  */
-static ossl_inline void gf_sub_nr ( gf c, const gf a, const gf b ) {
-    gf_sub_RAW(c,a,b);
+static ossl_inline void gf_sub_nr(gf c, const gf a, const gf b)
+{
+    gf_sub_RAW(c, a, b);
    gf_bias(c, 2);
-    if (GF_HEADROOM < 3) gf_weak_reduce(c);
+    if (GF_HEADROOM < 3)
+        gf_weak_reduce(c);
 }

 /** Subtract mod p. Bias by amt but don't reduce.  */
-static ossl_inline void gf_subx_nr ( gf c, const gf a, const gf b, int amt ) {
-    gf_sub_RAW(c,a,b);
+static ossl_inline void gf_subx_nr(gf c, const gf a, const gf b, int amt)
+{
+    gf_sub_RAW(c, a, b);
    gf_bias(c, amt);
-    if (GF_HEADROOM < amt+1) gf_weak_reduce(c);
+    if (GF_HEADROOM < amt + 1)
+        gf_weak_reduce(c);
 }

 /** Mul by signed int.  Not constant-time WRT the sign of that int. */
-static ossl_inline void gf_mulw(gf c, const gf a, int32_t w) {
-    if (w>0) {
+static ossl_inline void gf_mulw(gf c, const gf a, int32_t w)
+{
+    if (w > 0) {
        gf_mulw_unsigned(c, a, w);
    } else {
        gf_mulw_unsigned(c, a, -w);
-        gf_sub(c,ZERO,c);
+        gf_sub(c, ZERO, c);
    }
 }

 /** Constant time, x = is_z ? z : y */
-static ossl_inline void gf_cond_sel(gf x, const gf y, const gf z, mask_t is_z) {
-    constant_time_select(x,y,z,sizeof(gf),is_z,0);
+static ossl_inline void gf_cond_sel(gf x, const gf y, const gf z, mask_t is_z)
+{
+    constant_time_select(x, y, z, sizeof(gf), is_z, 0);
 }

 /** Constant time, if (neg) x=-x; */
-static ossl_inline void gf_cond_neg(gf x, mask_t neg) {
+static ossl_inline void gf_cond_neg(gf x, mask_t neg)
+{
    gf y;
-    gf_sub(y,ZERO,x);
-    gf_cond_sel(x,x,y,neg);
+    gf_sub(y, ZERO, x);
+    gf_cond_sel(x, x, y, neg);
 }

 /** Constant time, if (swap) (x,y) = (y,x); */
-static ossl_inline void
-gf_cond_swap(gf x, gf_s *__restrict__ y, mask_t swap) {
-    constant_time_cond_swap(x,y,sizeof(gf_s),swap);
+static ossl_inline void gf_cond_swap(gf x, gf_s * __restrict__ y, mask_t swap)
+{
+    constant_time_cond_swap(x, y, sizeof(gf_s), swap);
 }

-static ossl_inline void gf_mul_qnr(gf_s *__restrict__ out, const gf x) {
-    gf_sub(out,ZERO,x);
+static ossl_inline void gf_mul_qnr(gf_s * __restrict__ out, const gf x)
+{
+    gf_sub(out, ZERO, x);
 }

-static ossl_inline void gf_div_qnr(gf_s *__restrict__ out, const gf x) {
-    gf_sub(out,ZERO,x);
+static ossl_inline void gf_div_qnr(gf_s * __restrict__ out, const gf x)
+{
+    gf_sub(out, ZERO, x);
 }

-
-#endif /* __GF_H__ */
+#endif                          /* __GF_H__ */
--- a/crypto/ec/curve448/point_448.h
+++ b/crypto/ec/curve448/point_448.h
@ -11,52 +11,52 @@
 */

 #ifndef __DECAF_POINT_448_H__
-#define __DECAF_POINT_448_H__ 1
+# define __DECAF_POINT_448_H__ 1

-#include "curve448utils.h"
-#include "field.h"
+# include "curve448utils.h"
+# include "field.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

 /** @cond internal */
-#define DECAF_448_SCALAR_LIMBS ((446-1)/DECAF_WORD_BITS+1)
+# define DECAF_448_SCALAR_LIMBS ((446-1)/DECAF_WORD_BITS+1)
 /** @endcond */

 /** The number of bits in a scalar */
-#define DECAF_448_SCALAR_BITS 446
+# define DECAF_448_SCALAR_BITS 446

 /** Number of bytes in a serialized point. */
-#define DECAF_448_SER_BYTES 56
+# define DECAF_448_SER_BYTES 56

 /** Number of bytes in an elligated point.  For now set the same as SER_BYTES
 * but could be different for other curves.
 */
-#define DECAF_448_HASH_BYTES 56
+# define DECAF_448_HASH_BYTES 56

 /** Number of bytes in a serialized scalar. */
-#define DECAF_448_SCALAR_BYTES 56
+# define DECAF_448_SCALAR_BYTES 56

 /** Number of bits in the "which" field of an elligator inverse */
-#define DECAF_448_INVERT_ELLIGATOR_WHICH_BITS 3
+# define DECAF_448_INVERT_ELLIGATOR_WHICH_BITS 3

 /** The cofactor the curve would have, if we hadn't removed it */
-#define DECAF_448_REMOVED_COFACTOR 4
+# define DECAF_448_REMOVED_COFACTOR 4

 /** X448 encoding ratio. */
-#define DECAF_X448_ENCODE_RATIO 2
+# define DECAF_X448_ENCODE_RATIO 2

 /** Number of bytes in an x448 public key */
-#define DECAF_X448_PUBLIC_BYTES 56
+# define DECAF_X448_PUBLIC_BYTES 56

 /** Number of bytes in an x448 private key */
-#define DECAF_X448_PRIVATE_BYTES 56
+# define DECAF_X448_PRIVATE_BYTES 56

 /** Twisted Edwards extended homogeneous coordinates */
 typedef struct curve448_point_s {
    /** @cond internal */
-    gf_448_t x,y,z,t;
+    gf_448_t x, y, z, t;
    /** @endcond */
 } curve448_point_t[1];

@ -64,7 +64,7 @@ typedef struct curve448_point_s {
 struct curve448_precomputed_s;

 /** Precomputed table based on a point.  Can be trivial implementation. */
-typedef struct curve448_precomputed_s curve448_precomputed_s; 
+typedef struct curve448_precomputed_s curve448_precomputed_s;

 /** Scalar is stored packed, because we don't need the speed. */
 typedef struct curve448_scalar_s {
@ -98,10 +98,10 @@ extern const struct curve448_precomputed_s *curve448_precomputed_base;
 * @retval DECAF_FAILURE The scalar was greater than the modulus,
 * and has been reduced modulo that modulus.
 */
-__owur decaf_error_t curve448_scalar_decode (
-    curve448_scalar_t out,
-    const unsigned char ser[DECAF_448_SCALAR_BYTES]
-);
+__owur decaf_error_t curve448_scalar_decode(curve448_scalar_t out,
+                                            const unsigned char
+                                            ser[DECAF_448_SCALAR_BYTES]
+    );

 /**
 * @brief Read a scalar from wire format or from bytes.  Reduces mod
@ -111,68 +111,51 @@ __owur decaf_error_t curve448_scalar_decode (
 * @param [in] ser_len Length of serialized form.
 * @param [out] out Deserialized form.
 */
-void curve448_scalar_decode_long (
-    curve448_scalar_t out,
-    const unsigned char *ser,
-    size_t ser_len
-);
-    
+void curve448_scalar_decode_long(curve448_scalar_t out,
+                                 const unsigned char *ser, size_t ser_len);
+
 /**
 * @brief Serialize a scalar to wire format.
 *
 * @param [out] ser Serialized form of a scalar.
 * @param [in] s Deserialized scalar.
 */
-void curve448_scalar_encode (
-    unsigned char ser[DECAF_448_SCALAR_BYTES],
-    const curve448_scalar_t s
-);
-        
+void curve448_scalar_encode(unsigned char ser[DECAF_448_SCALAR_BYTES],
+                            const curve448_scalar_t s);
+
 /**
 * @brief Add two scalars.  The scalars may use the same memory.
 * @param [in] a One scalar.
 * @param [in] b Another scalar.
 * @param [out] out a+b.
 */
-void curve448_scalar_add (
-    curve448_scalar_t out,
-    const curve448_scalar_t a,
-    const curve448_scalar_t b
-);
+void curve448_scalar_add(curve448_scalar_t out,
+                         const curve448_scalar_t a, const curve448_scalar_t b);

 /**
 * @brief Subtract two scalars.  The scalars may use the same memory.
 * @param [in] a One scalar.
 * @param [in] b Another scalar.
 * @param [out] out a-b.
- */  
-void curve448_scalar_sub (
-    curve448_scalar_t out,
-    const curve448_scalar_t a,
-    const curve448_scalar_t b
-);
+ */
+void curve448_scalar_sub(curve448_scalar_t out,
+                         const curve448_scalar_t a, const curve448_scalar_t b);

 /**
 * @brief Multiply two scalars.  The scalars may use the same memory.
 * @param [in] a One scalar.
 * @param [in] b Another scalar.
 * @param [out] out a*b.
- */  
-void curve448_scalar_mul (
-    curve448_scalar_t out,
-    const curve448_scalar_t a,
-    const curve448_scalar_t b
-);
-        
+ */
+void curve448_scalar_mul(curve448_scalar_t out,
+                         const curve448_scalar_t a, const curve448_scalar_t b);
+
 /**
 * @brief Halve a scalar.  The scalars may use the same memory.
 * @param [in] a A scalar.
 * @param [out] out a/2.
 */
-void curve448_scalar_halve (
-   curve448_scalar_t out,
-   const curve448_scalar_t a
-);
+void curve448_scalar_halve(curve448_scalar_t out, const curve448_scalar_t a);

 /**
 * @brief Copy a scalar.  The scalars may use the same memory, in which
@ -180,10 +163,9 @@ void curve448_scalar_halve (
 * @param [in] a A scalar.
 * @param [out] out Will become a copy of a.
 */
-static ossl_inline void curve448_scalar_copy (
-    curve448_scalar_t out,
-    const curve448_scalar_t a
-) {
+static ossl_inline void curve448_scalar_copy(curve448_scalar_t out,
+                                             const curve448_scalar_t a)
+{
    *out = *a;
 }

@ -194,11 +176,10 @@ static ossl_inline void curve448_scalar_copy (
 * @param [out] a A copy of the point.
 * @param [in] b Any point.
 */
-static ossl_inline void curve448_point_copy (
-    curve448_point_t a,
-    const curve448_point_t b
-) {
-    *a=*b;
+static ossl_inline void curve448_point_copy(curve448_point_t a,
+                                            const curve448_point_t b)
+{
+    *a = *b;
 }

 /**
@ -210,10 +191,8 @@ static ossl_inline void curve448_point_copy (
 * @retval DECAF_TRUE The points are equal.
 * @retval DECAF_FALSE The points are not equal.
 */
-__owur decaf_bool_t curve448_point_eq (
-    const curve448_point_t a,
-    const curve448_point_t b
-);
+__owur decaf_bool_t curve448_point_eq(const curve448_point_t a,
+                                      const curve448_point_t b);

 /**
 * @brief Double a point.  Equivalent to
@ -222,10 +201,7 @@ __owur decaf_bool_t curve448_point_eq (
 * @param [out] two_a The sum a+a.
 * @param [in] a A point.
 */
-void curve448_point_double (
-    curve448_point_t two_a,
-    const curve448_point_t a
-);
+void curve448_point_double(curve448_point_t two_a, const curve448_point_t a);

 /**
 * @brief RFC 7748 Diffie-Hellman scalarmul.  This function uses a different
@ -239,11 +215,10 @@ void curve448_point_double (
 * @retval DECAF_FAILURE The scalarmul didn't succeed, because the base
 * point is in a small subgroup.
 */
-__owur decaf_error_t decaf_x448 (
-    uint8_t out[DECAF_X448_PUBLIC_BYTES],
-    const uint8_t base[DECAF_X448_PUBLIC_BYTES],
-    const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
-);
+__owur decaf_error_t decaf_x448(uint8_t out[DECAF_X448_PUBLIC_BYTES],
+                                const uint8_t base[DECAF_X448_PUBLIC_BYTES],
+                                const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
+    );

 /**
 * @brief Multiply a point by DECAF_X448_ENCODE_RATIO,
@ -265,14 +240,14 @@ __owur decaf_error_t decaf_x448 (
 * @param [out] out The scaled and encoded point.
 * @param [in] p The point to be scaled and encoded.
 */
-void curve448_point_mul_by_ratio_and_encode_like_x448 (
-    uint8_t out[DECAF_X448_PUBLIC_BYTES],
-    const curve448_point_t p
-);
+void curve448_point_mul_by_ratio_and_encode_like_x448(uint8_t
+                                                      out
+                                                      [DECAF_X448_PUBLIC_BYTES],
+                                                      const curve448_point_t p);

 /** The base point for X448 Diffie-Hellman */
 extern const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES];
-    
+
 /**
 * @brief RFC 7748 Diffie-Hellman base point scalarmul.  This function uses
 * a different (non-Decaf) encoding.
@ -283,11 +258,9 @@ extern const uint8_t decaf_x448_base_point[DECAF_X448_PUBLIC_BYTES];
 * @param [out] scaled The scaled point base*scalar
 * @param [in] scalar The scalar to multiply by.
 */
-void decaf_x448_derive_public_key (
-    uint8_t out[DECAF_X448_PUBLIC_BYTES],
-    const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
-);
-
+void decaf_x448_derive_public_key(uint8_t out[DECAF_X448_PUBLIC_BYTES],
+                                  const uint8_t scalar[DECAF_X448_PRIVATE_BYTES]
+    );

 /**
 * @brief Multiply a precomputed base point by a scalar:
@ -300,12 +273,9 @@ void decaf_x448_derive_public_key (
 * @param [in] base The point to be scaled.
 * @param [in] scalar The scalar to multiply by.
 */
-void curve448_precomputed_scalarmul (
-    curve448_point_t scaled,
-    const curve448_precomputed_s *base,
-    const curve448_scalar_t scalar
-);
-
+void curve448_precomputed_scalarmul(curve448_point_t scaled,
+                                    const curve448_precomputed_s * base,
+                                    const curve448_scalar_t scalar);

 /**
 * @brief Multiply two base points by two scalars:
@ -322,12 +292,10 @@ void curve448_precomputed_scalarmul (
 * @warning: This function takes variable time, and may leak the scalars
 * used.  It is designed for signature verification.
 */
-void curve448_base_double_scalarmul_non_secret (
-    curve448_point_t combo,
-    const curve448_scalar_t scalar1,
-    const curve448_point_t base2,
-    const curve448_scalar_t scalar2
-);
+void curve448_base_double_scalarmul_non_secret(curve448_point_t combo,
+                                               const curve448_scalar_t scalar1,
+                                               const curve448_point_t base2,
+                                               const curve448_scalar_t scalar2);

 /**
 * @brief Test that a point is valid, for debugging purposes.
@ -336,26 +304,20 @@ void curve448_base_double_scalarmul_non_secret (
 * @retval DECAF_TRUE The point is valid.
 * @retval DECAF_FALSE The point is invalid.
 */
-__owur decaf_bool_t curve448_point_valid (
-    const curve448_point_t to_test
-);
+__owur decaf_bool_t curve448_point_valid(const curve448_point_t to_test);

 /**
 * @brief Overwrite scalar with zeros.
 */
-void curve448_scalar_destroy (
-    curve448_scalar_t scalar
-);
+void curve448_scalar_destroy(curve448_scalar_t scalar);

 /**
 * @brief Overwrite point with zeros.
 */
-void curve448_point_destroy (
-    curve448_point_t point
-);
+void curve448_point_destroy(curve448_point_t point);

 #ifdef __cplusplus
 } /* extern "C" */
 #endif

-#endif /* __DECAF_POINT_448_H__ */
+#endif                          /* __DECAF_POINT_448_H__ */
--- a/crypto/ec/curve448/scalar.c
+++ b/crypto/ec/curve448/scalar.c
@ -15,110 +15,114 @@
 #include "constant_time.h"
 #include "point_448.h"

-static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t)0x3bd440fae918bc5;
-static const curve448_scalar_t sc_p = {{{
-    SC_LIMB(0x2378c292ab5844f3), SC_LIMB(0x216cc2728dc58f55), SC_LIMB(0xc44edb49aed63690), SC_LIMB(0xffffffff7cca23e9), SC_LIMB(0xffffffffffffffff), SC_LIMB(0xffffffffffffffff), SC_LIMB(0x3fffffffffffffff)
-}}}, sc_r2 = {{{
-    SC_LIMB(0xe3539257049b9b60), SC_LIMB(0x7af32c4bc1b195d9), SC_LIMB(0x0d66de2388ea1859), SC_LIMB(0xae17cf725ee4d838), SC_LIMB(0x1a9cc14ba3c47c44), SC_LIMB(0x2052bcb7e4d070af), SC_LIMB(0x3402a939f823b729)
+static const decaf_word_t MONTGOMERY_FACTOR = (decaf_word_t) 0x3bd440fae918bc5;
+static const curve448_scalar_t sc_p = { {{
+                                          SC_LIMB(0x2378c292ab5844f3),
+                                          SC_LIMB(0x216cc2728dc58f55),
+                                          SC_LIMB(0xc44edb49aed63690),
+                                          SC_LIMB(0xffffffff7cca23e9),
+                                          SC_LIMB(0xffffffffffffffff),
+                                          SC_LIMB(0xffffffffffffffff),
+                                          SC_LIMB(0x3fffffffffffffff)
+                                          }}
+}, sc_r2 = { { {
+
+            SC_LIMB(0xe3539257049b9b60), SC_LIMB(0x7af32c4bc1b195d9),
+                SC_LIMB(0x0d66de2388ea1859), SC_LIMB(0xae17cf725ee4d838),
+                SC_LIMB(0x1a9cc14ba3c47c44), SC_LIMB(0x2052bcb7e4d070af),
+                SC_LIMB(0x3402a939f823b729)
 }}};
+
 /* End of template stuff */

-#define WBITS DECAF_WORD_BITS /* NB this may be different from ARCH_WORD_BITS */
+#define WBITS DECAF_WORD_BITS   /* NB this may be different from ARCH_WORD_BITS */

-const curve448_scalar_t curve448_scalar_one = {{{1}}}, curve448_scalar_zero = {{{0}}};
+const curve448_scalar_t curve448_scalar_one = { {{1}} }, curve448_scalar_zero = { { {
+0}}};

 /** {extra,accum} - sub +? p
 * Must have extra <= 1
 */
-static void sc_subx(
-    curve448_scalar_t out,
-    const decaf_word_t accum[DECAF_448_SCALAR_LIMBS],
-    const curve448_scalar_t sub,
-    const curve448_scalar_t p,
-    decaf_word_t extra
-) {
+static void sc_subx(curve448_scalar_t out,
+                    const decaf_word_t accum[DECAF_448_SCALAR_LIMBS],
+                    const curve448_scalar_t sub,
+                    const curve448_scalar_t p, decaf_word_t extra)
+{
    decaf_dsword_t chain = 0;
    unsigned int i;
    decaf_word_t borrow;

-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
        chain = (chain + accum[i]) - sub->limb[i];
        out->limb[i] = chain;
        chain >>= WBITS;
    }
-    borrow = chain+extra; /* = 0 or -1 */
-    
+    borrow = chain + extra;     /* = 0 or -1 */
+
    chain = 0;
-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
        chain = (chain + out->limb[i]) + (p->limb[i] & borrow);
        out->limb[i] = chain;
        chain >>= WBITS;
    }
 }

-static void sc_montmul (
-    curve448_scalar_t out,
-    const curve448_scalar_t a,
-    const curve448_scalar_t b
-) {
-    unsigned int i,j;
-    decaf_word_t accum[DECAF_448_SCALAR_LIMBS+1] = {0};
+static void sc_montmul(curve448_scalar_t out,
+                       const curve448_scalar_t a, const curve448_scalar_t b)
+{
+    unsigned int i, j;
+    decaf_word_t accum[DECAF_448_SCALAR_LIMBS + 1] = { 0 };
    decaf_word_t hi_carry = 0;
-    
-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
+
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
        decaf_word_t mand = a->limb[i];
        const decaf_word_t *mier = b->limb;
-        
+
        decaf_dword_t chain = 0;
-        for (j=0; j<DECAF_448_SCALAR_LIMBS; j++) {
-            chain += ((decaf_dword_t)mand)*mier[j] + accum[j];
+        for (j = 0; j < DECAF_448_SCALAR_LIMBS; j++) {
+            chain += ((decaf_dword_t) mand) * mier[j] + accum[j];
            accum[j] = chain;
            chain >>= WBITS;
        }
        accum[j] = chain;
-        
+
        mand = accum[0] * MONTGOMERY_FACTOR;
        chain = 0;
        mier = sc_p->limb;
-        for (j=0; j<DECAF_448_SCALAR_LIMBS; j++) {
-            chain += (decaf_dword_t)mand*mier[j] + accum[j];
-            if (j) accum[j-1] = chain;
+        for (j = 0; j < DECAF_448_SCALAR_LIMBS; j++) {
+            chain += (decaf_dword_t) mand *mier[j] + accum[j];
+            if (j)
+                accum[j - 1] = chain;
            chain >>= WBITS;
        }
        chain += accum[j];
        chain += hi_carry;
-        accum[j-1] = chain;
+        accum[j - 1] = chain;
        hi_carry = chain >> WBITS;
    }
-    
+
    sc_subx(out, accum, sc_p, sc_p, hi_carry);
 }

-void curve448_scalar_mul (
-    curve448_scalar_t out,
-    const curve448_scalar_t a,
-    const curve448_scalar_t b
-) {
-    sc_montmul(out,a,b);
-    sc_montmul(out,out,sc_r2);
+void curve448_scalar_mul(curve448_scalar_t out,
+                         const curve448_scalar_t a, const curve448_scalar_t b)
+{
+    sc_montmul(out, a, b);
+    sc_montmul(out, out, sc_r2);
 }

-void curve448_scalar_sub (
-    curve448_scalar_t out,
-    const curve448_scalar_t a,
-    const curve448_scalar_t b
-) {
+void curve448_scalar_sub(curve448_scalar_t out,
+                         const curve448_scalar_t a, const curve448_scalar_t b)
+{
    sc_subx(out, a->limb, b, sc_p, 0);
 }

-void curve448_scalar_add (
-    curve448_scalar_t out,
-    const curve448_scalar_t a,
-    const curve448_scalar_t b
-) {
+void curve448_scalar_add(curve448_scalar_t out,
+                         const curve448_scalar_t a, const curve448_scalar_t b)
+{
    decaf_dword_t chain = 0;
    unsigned int i;
-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
        chain = (chain + a->limb[i]) + b->limb[i];
        out->limb[i] = chain;
        chain >>= WBITS;
@ -126,50 +130,47 @@ void curve448_scalar_add (
    sc_subx(out, out->limb, sc_p, sc_p, chain);
 }

-static ossl_inline void scalar_decode_short (
-    curve448_scalar_t s,
-    const unsigned char *ser,
-    unsigned int nbytes
-) {
-    unsigned int i,j,k=0;
-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
+static ossl_inline void scalar_decode_short(curve448_scalar_t s,
+                                            const unsigned char *ser,
+                                            unsigned int nbytes)
+{
+    unsigned int i, j, k = 0;
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
        decaf_word_t out = 0;
-        for (j=0; j<sizeof(decaf_word_t) && k<nbytes; j++,k++) {
-            out |= ((decaf_word_t)ser[k])<<(8*j);
+        for (j = 0; j < sizeof(decaf_word_t) && k < nbytes; j++, k++) {
+            out |= ((decaf_word_t) ser[k]) << (8 * j);
        }
        s->limb[i] = out;
    }
 }

-decaf_error_t curve448_scalar_decode(
-    curve448_scalar_t s,
-    const unsigned char ser[DECAF_448_SCALAR_BYTES]
-) {
+decaf_error_t curve448_scalar_decode(curve448_scalar_t s,
+                                     const unsigned char
+                                     ser[DECAF_448_SCALAR_BYTES]
+    )
+{
    unsigned int i;
    decaf_dsword_t accum = 0;

    scalar_decode_short(s, ser, DECAF_448_SCALAR_BYTES);
-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
        accum = (accum + s->limb[i] - sc_p->limb[i]) >> WBITS;
    }
    /* Here accum == 0 or -1 */
-    
-    curve448_scalar_mul(s,s,curve448_scalar_one); /* ham-handed reduce */
-    
+
+    curve448_scalar_mul(s, s, curve448_scalar_one); /* ham-handed reduce */
+
    return decaf_succeed_if(~word_is_zero(accum));
 }

-void curve448_scalar_destroy (
-    curve448_scalar_t scalar
-) {
+void curve448_scalar_destroy(curve448_scalar_t scalar)
+{
    OPENSSL_cleanse(scalar, sizeof(curve448_scalar_t));
 }

-void curve448_scalar_decode_long(
-    curve448_scalar_t s,
-    const unsigned char *ser,
-    size_t ser_len
-) {
+void curve448_scalar_decode_long(curve448_scalar_t s,
+                                 const unsigned char *ser, size_t ser_len)
+{
    size_t i;
    curve448_scalar_t t1, t2;

@ -178,23 +179,24 @@ void curve448_scalar_decode_long(
        return;
    }

-    i = ser_len - (ser_len%DECAF_448_SCALAR_BYTES);
-    if (i==ser_len) i -= DECAF_448_SCALAR_BYTES;
-    
-    scalar_decode_short(t1, &ser[i], ser_len-i);
+    i = ser_len - (ser_len % DECAF_448_SCALAR_BYTES);
+    if (i == ser_len)
+        i -= DECAF_448_SCALAR_BYTES;
+
+    scalar_decode_short(t1, &ser[i], ser_len - i);

    if (ser_len == sizeof(curve448_scalar_t)) {
-        assert(i==0);
+        assert(i == 0);
        /* ham-handed reduce */
-        curve448_scalar_mul(s,t1,curve448_scalar_one);
+        curve448_scalar_mul(s, t1, curve448_scalar_one);
        curve448_scalar_destroy(t1);
        return;
    }

    while (i) {
        i -= DECAF_448_SCALAR_BYTES;
-        sc_montmul(t1,t1,sc_r2);
-        ignore_result( curve448_scalar_decode(t2, ser+i) );
+        sc_montmul(t1, t1, sc_r2);
+        ignore_result(curve448_scalar_decode(t2, ser + i));
        curve448_scalar_add(t1, t1, t2);
    }

@ -203,33 +205,29 @@ void curve448_scalar_decode_long(
    curve448_scalar_destroy(t2);
 }

-void curve448_scalar_encode(
-    unsigned char ser[DECAF_448_SCALAR_BYTES],
-    const curve448_scalar_t s
-) {
-    unsigned int i,j,k=0;
-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
-        for (j=0; j<sizeof(decaf_word_t); j++,k++) {
-            ser[k] = s->limb[i] >> (8*j);
+void curve448_scalar_encode(unsigned char ser[DECAF_448_SCALAR_BYTES],
+                            const curve448_scalar_t s)
+{
+    unsigned int i, j, k = 0;
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
+        for (j = 0; j < sizeof(decaf_word_t); j++, k++) {
+            ser[k] = s->limb[i] >> (8 * j);
        }
    }
 }

-void curve448_scalar_halve (
-    curve448_scalar_t out,
-    const curve448_scalar_t a
-) {
+void curve448_scalar_halve(curve448_scalar_t out, const curve448_scalar_t a)
+{
    decaf_word_t mask = -(a->limb[0] & 1);
    decaf_dword_t chain = 0;
    unsigned int i;
-    for (i=0; i<DECAF_448_SCALAR_LIMBS; i++) {
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS; i++) {
        chain = (chain + a->limb[i]) + (sc_p->limb[i] & mask);
        out->limb[i] = chain;
        chain >>= DECAF_WORD_BITS;
    }
-    for (i=0; i<DECAF_448_SCALAR_LIMBS-1; i++) {
-        out->limb[i] = out->limb[i]>>1 | out->limb[i+1]<<(WBITS-1);
+    for (i = 0; i < DECAF_448_SCALAR_LIMBS - 1; i++) {
+        out->limb[i] = out->limb[i] >> 1 | out->limb[i + 1] << (WBITS - 1);
    }
-    out->limb[i] = out->limb[i]>>1 | chain<<(WBITS-1);
+    out->limb[i] = out->limb[i] >> 1 | chain << (WBITS - 1);
 }
-
--- a/crypto/ec/curve448/word.h
+++ b/crypto/ec/curve448/word.h
@ -11,208 +11,212 @@
 */

 #ifndef __WORD_H__
-#define __WORD_H__
+# define __WORD_H__

-#include <string.h>
+# include <string.h>

-#include <assert.h>
-#include <openssl/e_os2.h>
-#include "arch_intrinsics.h"
+# include <assert.h>
+# include <openssl/e_os2.h>
+# include "arch_intrinsics.h"

-#include "curve448utils.h"
+# include "curve448utils.h"

-#ifndef _BSD_SOURCE
-#define _BSD_SOURCE 1
-#endif
+# ifndef _BSD_SOURCE
+#  define _BSD_SOURCE 1
+# endif

-#ifndef _DEFAULT_SOURCE
-#define _DEFAULT_SOURCE 1
-#endif
+# ifndef _DEFAULT_SOURCE
+#  define _DEFAULT_SOURCE 1
+# endif

-#include <stdlib.h>
+# include <stdlib.h>

-#if defined(__ARM_NEON__)
-#include <arm_neon.h>
-#elif defined(__SSE2__)
-    #if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
-        #include <immintrin.h>
-    #else
-        #include <emmintrin.h>
-    #endif
-#endif
+# if defined(__ARM_NEON__)
+#  include <arm_neon.h>
+# elif defined(__SSE2__)
+#  if !defined(__GNUC__) || defined(__clang__) || __GNUC__ >= 5 || (__GNUC__==4 && __GNUC_MINOR__ >= 4)
+#   include <immintrin.h>
+#  else
+#   include <emmintrin.h>
+#  endif
+# endif

-#if (ARCH_WORD_BITS == 64)
-    typedef uint64_t word_t, mask_t;
-    typedef __uint128_t dword_t;
-    typedef int32_t hsword_t;
-    typedef int64_t sword_t;
-    typedef __int128_t dsword_t;
-#elif (ARCH_WORD_BITS == 32)
-    typedef uint32_t word_t, mask_t;
-    typedef uint64_t dword_t;
-    typedef int16_t hsword_t;
-    typedef int32_t sword_t;
-    typedef int64_t dsword_t;
-#else
-    #error "For now, libdecaf only supports 32- and 64-bit architectures."
-#endif
-    
-/* Scalar limbs are keyed off of the API word size instead of the arch word size. */
-#if DECAF_WORD_BITS == 64
-    #define SC_LIMB(x) (x)
-#elif DECAF_WORD_BITS == 32
-    #define SC_LIMB(x) ((uint32_t)x),(x>>32)
-#else
-    #error "For now, libdecaf only supports 32- and 64-bit architectures."
-#endif
+# if (ARCH_WORD_BITS == 64)
+typedef uint64_t word_t, mask_t;
+typedef __uint128_t dword_t;
+typedef int32_t hsword_t;
+typedef int64_t sword_t;
+typedef __int128_t dsword_t;
+# elif (ARCH_WORD_BITS == 32)
+typedef uint32_t word_t, mask_t;
+typedef uint64_t dword_t;
+typedef int16_t hsword_t;
+typedef int32_t sword_t;
+typedef int64_t dsword_t;
+# else
+#  error "For now, libdecaf only supports 32- and 64-bit architectures."
+# endif

-#ifdef __ARM_NEON__
-    typedef uint32x4_t vecmask_t;
-#elif defined(__clang__)
-    typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
-    typedef int64_t  int64x2_t __attribute__((ext_vector_type(2)));
-    typedef uint64_t uint64x4_t __attribute__((ext_vector_type(4)));
-    typedef int64_t  int64x4_t __attribute__((ext_vector_type(4)));
-    typedef uint32_t uint32x4_t __attribute__((ext_vector_type(4)));
-    typedef int32_t  int32x4_t __attribute__((ext_vector_type(4)));
-    typedef uint32_t uint32x2_t __attribute__((ext_vector_type(2)));
-    typedef int32_t  int32x2_t __attribute__((ext_vector_type(2)));
-    typedef uint32_t uint32x8_t __attribute__((ext_vector_type(8)));
-    typedef int32_t  int32x8_t __attribute__((ext_vector_type(8)));
-    typedef word_t vecmask_t __attribute__((ext_vector_type(4)));
-#else /* GCC, hopefully? */
-    typedef uint64_t uint64x2_t __attribute__((vector_size(16)));
-    typedef int64_t  int64x2_t __attribute__((vector_size(16)));
-    typedef uint64_t uint64x4_t __attribute__((vector_size(32)));
-    typedef int64_t  int64x4_t __attribute__((vector_size(32)));
-    typedef uint32_t uint32x4_t __attribute__((vector_size(16)));
-    typedef int32_t  int32x4_t __attribute__((vector_size(16)));
-    typedef uint32_t uint32x2_t __attribute__((vector_size(8)));
-    typedef int32_t  int32x2_t __attribute__((vector_size(8)));
-    typedef uint32_t uint32x8_t __attribute__((vector_size(32)));
-    typedef int32_t  int32x8_t __attribute__((vector_size(32)));
-    typedef word_t vecmask_t __attribute__((vector_size(32)));
-#endif
+/*
+ * Scalar limbs are keyed off of the API word size instead of the arch word
+ * size.
+ */
+# if DECAF_WORD_BITS == 64
+#  define SC_LIMB(x) (x)
+# elif DECAF_WORD_BITS == 32
+#  define SC_LIMB(x) ((uint32_t)x),(x>>32)
+# else
+#  error "For now, libdecaf only supports 32- and 64-bit architectures."
+# endif

-#if defined(__AVX2__)
-    #define VECTOR_ALIGNED __attribute__((aligned(32)))
-    typedef uint32x8_t big_register_t;
-    typedef uint64x4_t uint64xn_t;
-    typedef uint32x8_t uint32xn_t;
+# ifdef __ARM_NEON__
+typedef uint32x4_t vecmask_t;
+# elif defined(__clang__)
+typedef uint64_t uint64x2_t __attribute__ ((ext_vector_type(2)));
+typedef int64_t int64x2_t __attribute__ ((ext_vector_type(2)));
+typedef uint64_t uint64x4_t __attribute__ ((ext_vector_type(4)));
+typedef int64_t int64x4_t __attribute__ ((ext_vector_type(4)));
+typedef uint32_t uint32x4_t __attribute__ ((ext_vector_type(4)));
+typedef int32_t int32x4_t __attribute__ ((ext_vector_type(4)));
+typedef uint32_t uint32x2_t __attribute__ ((ext_vector_type(2)));
+typedef int32_t int32x2_t __attribute__ ((ext_vector_type(2)));
+typedef uint32_t uint32x8_t __attribute__ ((ext_vector_type(8)));
+typedef int32_t int32x8_t __attribute__ ((ext_vector_type(8)));
+typedef word_t vecmask_t __attribute__ ((ext_vector_type(4)));
+# else                          /* GCC, hopefully? */
+typedef uint64_t uint64x2_t __attribute__ ((vector_size(16)));
+typedef int64_t int64x2_t __attribute__ ((vector_size(16)));
+typedef uint64_t uint64x4_t __attribute__ ((vector_size(32)));
+typedef int64_t int64x4_t __attribute__ ((vector_size(32)));
+typedef uint32_t uint32x4_t __attribute__ ((vector_size(16)));
+typedef int32_t int32x4_t __attribute__ ((vector_size(16)));
+typedef uint32_t uint32x2_t __attribute__ ((vector_size(8)));
+typedef int32_t int32x2_t __attribute__ ((vector_size(8)));
+typedef uint32_t uint32x8_t __attribute__ ((vector_size(32)));
+typedef int32_t int32x8_t __attribute__ ((vector_size(32)));
+typedef word_t vecmask_t __attribute__ ((vector_size(32)));
+# endif

-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        uint32_t y = (uint32_t)x;
-        big_register_t ret = {y,y,y,y,y,y,y,y};
-        return ret;
-    }
-#elif defined(__SSE2__)
-    #define VECTOR_ALIGNED __attribute__((aligned(16)))
-    typedef uint32x4_t big_register_t;
-    typedef uint64x2_t uint64xn_t;
-    typedef uint32x4_t uint32xn_t;
+# if defined(__AVX2__)
+#  define VECTOR_ALIGNED __attribute__((aligned(32)))
+typedef uint32x8_t big_register_t;
+typedef uint64x4_t uint64xn_t;
+typedef uint32x8_t uint32xn_t;

-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        uint32_t y = x;
-        big_register_t ret = {y,y,y,y};
-        return ret;
-    }
-#elif defined(__ARM_NEON__)
-    #define VECTOR_ALIGNED __attribute__((aligned(16)))
-    typedef uint32x4_t big_register_t;
-    typedef uint64x2_t uint64xn_t;
-    typedef uint32x4_t uint32xn_t;
-    
-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        return vdupq_n_u32(x);
-    }
-#elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    uint32_t y = (uint32_t)x;
+    big_register_t ret = { y, y, y, y, y, y, y, y };
+    return ret;
+}
+# elif defined(__SSE2__)
+#  define VECTOR_ALIGNED __attribute__((aligned(16)))
+typedef uint32x4_t big_register_t;
+typedef uint64x2_t uint64xn_t;
+typedef uint32x4_t uint32xn_t;
+
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    uint32_t y = x;
+    big_register_t ret = { y, y, y, y };
+    return ret;
+}
+# elif defined(__ARM_NEON__)
+#  define VECTOR_ALIGNED __attribute__((aligned(16)))
+typedef uint32x4_t big_register_t;
+typedef uint64x2_t uint64xn_t;
+typedef uint32x4_t uint32xn_t;
+
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    return vdupq_n_u32(x);
+}
+# elif defined(_WIN64) || defined(__amd64__) || defined(__X86_64__) \
      || defined(__aarch64__)
-    #define VECTOR_ALIGNED __attribute__((aligned(8)))
-    typedef uint64_t big_register_t, uint64xn_t;
+#  define VECTOR_ALIGNED __attribute__((aligned(8)))
+typedef uint64_t big_register_t, uint64xn_t;

-    typedef uint32_t uint32xn_t;
-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        return (big_register_t)x;
-    }
-#else
-    #define VECTOR_ALIGNED __attribute__((aligned(4)))
-    typedef uint64_t uint64xn_t;
-    typedef uint32_t uint32xn_t;
-    typedef uint32_t big_register_t;
+typedef uint32_t uint32xn_t;
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    return (big_register_t) x;
+}
+# else
+#  define VECTOR_ALIGNED __attribute__((aligned(4)))
+typedef uint64_t uint64xn_t;
+typedef uint32_t uint32xn_t;
+typedef uint32_t big_register_t;

-    static ossl_inline big_register_t
-    br_set_to_mask(mask_t x) {
-        return (big_register_t)x;
-    }
-#endif
+static ossl_inline big_register_t br_set_to_mask(mask_t x)
+{
+    return (big_register_t) x;
+}
+# endif

-#if defined(__AVX2__)
-    static ossl_inline big_register_t
-    br_is_zero(big_register_t x) {
-        return (big_register_t)(x == br_set_to_mask(0));
-    }
-#elif defined(__SSE2__)
-    static ossl_inline big_register_t
-    br_is_zero(big_register_t x) {
-        return (big_register_t)_mm_cmpeq_epi32((__m128i)x, _mm_setzero_si128());
-        //return (big_register_t)(x == br_set_to_mask(0));
-    }
-#elif defined(__ARM_NEON__)
-    static ossl_inline big_register_t
-    br_is_zero(big_register_t x) {
-        return vceqq_u32(x,x^x);
-    }
-#else
-    #define br_is_zero word_is_zero
-#endif
+# if defined(__AVX2__)
+static ossl_inline big_register_t br_is_zero(big_register_t x)
+{
+    return (big_register_t) (x == br_set_to_mask(0));
+}
+# elif defined(__SSE2__)
+static ossl_inline big_register_t br_is_zero(big_register_t x)
+{
+    return (big_register_t) _mm_cmpeq_epi32((__m128i) x, _mm_setzero_si128());
+    /* Alternative: return (big_register_t)(x == br_set_to_mask(0)); */
+}
+# elif defined(__ARM_NEON__)
+static ossl_inline big_register_t br_is_zero(big_register_t x)
+{
+    return vceqq_u32(x, x ^ x);
+}
+# else
+#  define br_is_zero word_is_zero
+# endif

 /* PERF: vectorize vs unroll */
-#ifdef __clang__
-#if 100*__clang_major__ + __clang_minor__ > 305
-#define UNROLL _Pragma("clang loop unroll(full)")
-#endif
-#endif
+# ifdef __clang__
+#  if 100*__clang_major__ + __clang_minor__ > 305
+#   define UNROLL _Pragma("clang loop unroll(full)")
+#  endif
+# endif

-#ifndef UNROLL
-#define UNROLL
-#endif
+# ifndef UNROLL
+#  define UNROLL
+# endif

-/* The plan on booleans:
- *
- * The external interface uses decaf_bool_t, but this might be a different
- * size than our particular arch's word_t (and thus mask_t).  Also, the caller
- * isn't guaranteed to pass it as nonzero.  So bool_to_mask converts word sizes
- * and checks nonzero.
- *
- * On the flip side, mask_t is always -1 or 0, but it might be a different size
- * than decaf_bool_t.
- *
- * On the third hand, we have success vs boolean types, but that's handled in
- * common.h: it converts between decaf_bool_t and decaf_error_t.
+/*
+ * The plan on booleans: The external interface uses decaf_bool_t, but this
+ * might be a different size than our particular arch's word_t (and thus
+ * mask_t).  Also, the caller isn't guaranteed to pass true as all-ones; any
+ * nonzero value may appear.  So bool_to_mask converts word sizes and maps
+ * nonzero to -1.  On the flip side, mask_t is always -1 or 0, but it might
+ * be a different size than decaf_bool_t.  On the third hand, we have success
+ * vs boolean types, but that's handled in common.h: it converts between
+ * decaf_bool_t and decaf_error_t.
 */
-static ossl_inline decaf_bool_t mask_to_bool (mask_t m) {
-    return (decaf_sword_t)(sword_t)m;
+static ossl_inline decaf_bool_t mask_to_bool(mask_t m)
+{
+    return (decaf_sword_t) (sword_t) m;
 }

-static ossl_inline mask_t bool_to_mask (decaf_bool_t m) {
+static ossl_inline mask_t bool_to_mask(decaf_bool_t m)
+{
    /* On most arches this will be optimized to a simple cast. */
    mask_t ret = 0;
    unsigned int i;

-    unsigned int limit = sizeof(decaf_bool_t)/sizeof(mask_t);
-    if (limit < 1) limit = 1;
-    for (i=0; i<limit; i++) {
-        ret |= ~ word_is_zero(m >> (i*8*sizeof(word_t)));
+    unsigned int limit = sizeof(decaf_bool_t) / sizeof(mask_t);
+    if (limit < 1)
+        limit = 1;
+    for (i = 0; i < limit; i++) {
+        ret |= ~word_is_zero(m >> (i * 8 * sizeof(word_t)));
    }
    return ret;
 }

-static ossl_inline void ignore_result ( decaf_bool_t boo ) {
+static ossl_inline void ignore_result(decaf_bool_t boo)
+{
    (void)boo;
 }

-#endif /* __WORD_H__ */
+#endif                          /* __WORD_H__ */