mirror of git://gcc.gnu.org/git/gcc.git
runtime: add arm64 version of AES hash code
Rewrite the arm64 AES hashing code from gc assembler to C code using
intrinsics. The resulting code generates the same hash code for the same
input as the gc code--that doesn't matter as such, but testing it ensures
that the C code does something useful.

Reviewed-on: https://go-review.googlesource.com/138535

From-SVN: r264771
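The rewrite leans on a single ARMv8-A crypto primitive pair throughout. As an
illustrative sketch (not part of the patch; `mix` is a hypothetical name), one
scrambling round XORs the data with a key and runs it through the AES
permutation steps:

    // Sketch only: assumes an AArch64 compiler with the crypto extension
    // enabled (e.g. -march=armv8-a+crypto).
    #include <arm_neon.h>

    // AESE = AddRoundKey (XOR) + SubBytes + ShiftRows; AESMC = MixColumns.
    static inline uint8x16_t mix(uint8x16_t val, uint8x16_t key) {
        return vaesmcq_u8(vaeseq_u8(val, key));
    }

The patch below composes this round at various widths, picking a strategy by
input size.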
parent df1346b423
commit 1b28253347
libgo/MERGE
@@ -1,4 +1,4 @@
-f4a224ec481957ca4f14d0e8cc4fe59cc95b3a49
+013a9e68c9a31f888733d46182d19f9e5d956f27
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
libgo/runtime/aeshash.c
@@ -573,13 +573,412 @@ uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
 
 #endif // !defined(__x86_64__)
 
-#else // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)
+#elif defined(__aarch64__)
+
+// Undefine some identifiers that we pick up from the Go runtime package that
+// are used in arm_neon.h.
+
+#undef t1
+#undef tx
+#undef t2
+#undef t3
+#undef t4
+#undef t5
+
+#include <arm_neon.h>
+
+// Force appropriate CPU level. We won't call here unless the CPU
+// supports it.
+
+#pragma GCC target("+crypto")
+
+// The arm64 version of aeshashbody.
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+  uint8x16_t *pseed;
+  uint32x4_t vinit32;
+  uint8x16_t vinit;
+  uint8x16_t vseed, vseed2, vseed3, vseed4;
+  uint8x16_t vseed5, vseed6, vseed7, vseed8;
+  uint8x16_t vval, vval2, vval3, vval4;
+  uint8x16_t vval5, vval6, vval7, vval8;
+  uint8x16_t vvalLoop, vvalLoop2, vvalLoop3, vvalLoop4;
+  uint8x16_t vvalLoop5, vvalLoop6, vvalLoop7, vvalLoop8;
+  uint8x16x2_t avval2;
+  uint8x16x3_t avseed3;
+
+  pseed = (uint8x16_t*)(aeskeysched.__values);
+
+  // Combined hash seed and length.
+  vinit32 = vdupq_n_u32(0);
+  vinit32[0] = (uint32)seed;
+  vinit32[1] = (uint32)size;
+  vinit = vreinterpretq_u8_u32(vinit32);
+
+  // Mix in per-process seed.
+  vseed = vaeseq_u8(*pseed, vinit);
+  ++pseed;
+  // Scramble seed.
+  vseed = vaesmcq_u8(vseed);
+
+  if (size <= 16) {
+    if (size == 0) {
+      // Return 64 bits of scrambled input seed.
+      return vreinterpretq_u64_u8(vseed)[0];
+    } else if (size < 16) {
+      vval = vreinterpretq_u8_u32(vdupq_n_u32(0));
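+      // Load the 1-15 trailing bytes into fixed, disjoint lanes of the
+      // zeroed vector: 8 bytes into the low u64 lane, 4 into u32 lane 2,
+      // 2 into u16 lane 6, and 1 into byte lane 14.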
+      if ((size & 8) != 0) {
+        vval = vreinterpretq_u8_u64(vld1q_lane_u64((uint64_t*)(p), vreinterpretq_u64_u8(vval), 0));
+        p = (void*)((uint64_t*)(p) + 1);
+      }
+      if ((size & 4) != 0) {
+        vval = vreinterpretq_u8_u32(vld1q_lane_u32((uint32_t*)(p), vreinterpretq_u32_u8(vval), 2));
+        p = (void*)((uint32_t*)(p) + 1);
+      }
+      if ((size & 2) != 0) {
+        vval = vreinterpretq_u8_u16(vld1q_lane_u16((uint16_t*)(p), vreinterpretq_u16_u8(vval), 6));
+        p = (void*)((uint16_t*)(p) + 1);
+      }
+      if ((size & 1) != 0) {
+        vval = vld1q_lane_u8((uint8*)(p), vval, 14);
+      }
+    } else {
+      vval = *(uint8x16_t*)(p);
+    }
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval = vaeseq_u8(vval, vseed);
+    return vreinterpretq_u64_u8(vval)[0];
+  } else if (size <= 32) {
+    // Make a second seed.
+    vseed2 = vaeseq_u8(*pseed, vinit);
+    vseed2 = vaesmcq_u8(vseed2);
+    vval = *(uint8x16_t*)(p);
+    vval2 = *(uint8x16_t*)((char*)(p) + (size - 16));
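+    // The two 16-byte loads overlap when size < 32, which is harmless.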
+
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval2 = vaesmcq_u8(vval2);
+
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval2 = vaesmcq_u8(vval2);
+
+    vval = vaeseq_u8(vval, vseed);
+    vval2 = vaeseq_u8(vval2, vseed2);
+
+    vval ^= vval2;
+
+    return vreinterpretq_u64_u8(vval)[0];
+  } else if (size <= 64) {
+    avseed3 = vld1q_u8_x3((uint8*)(pseed));
+    vseed2 = avseed3.val[0];
+    vseed3 = avseed3.val[1];
+    vseed4 = avseed3.val[2];
+
+    vseed2 = vaeseq_u8(vseed2, vinit);
+    vseed2 = vaesmcq_u8(vseed2);
+    vseed3 = vaeseq_u8(vseed3, vinit);
+    vseed3 = vaesmcq_u8(vseed3);
+    vseed4 = vaeseq_u8(vseed4, vinit);
+    vseed4 = vaesmcq_u8(vseed4);
+
+    avval2 = vld1q_u8_x2((uint8*)(p));
+    vval = avval2.val[0];
+    vval2 = avval2.val[1];
+    avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+    vval3 = avval2.val[0];
+    vval4 = avval2.val[1];
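+    // The first-32 and last-32 byte loads overlap when size < 64.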
+
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval2 = vaesmcq_u8(vval2);
+    vval3 = vaeseq_u8(vval3, vseed3);
+    vval3 = vaesmcq_u8(vval3);
+    vval4 = vaeseq_u8(vval4, vseed4);
+    vval4 = vaesmcq_u8(vval4);
+
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval2 = vaesmcq_u8(vval2);
+    vval3 = vaeseq_u8(vval3, vseed3);
+    vval3 = vaesmcq_u8(vval3);
+    vval4 = vaeseq_u8(vval4, vseed4);
+    vval4 = vaesmcq_u8(vval4);
+
+    vval = vaeseq_u8(vval, vseed);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval3 = vaeseq_u8(vval3, vseed3);
+    vval4 = vaeseq_u8(vval4, vseed4);
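+
+    // XOR the four lanes down to a single 128-bit value.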
+    vval ^= vval3;
+    vval2 ^= vval4;
+    vval ^= vval2;
+
+    return vreinterpretq_u64_u8(vval)[0];
+  } else if (size <= 128) {
+    // For some reason vld1q_u8_x4 is missing.
+    avseed3 = vld1q_u8_x3((uint8*)(pseed));
+    vseed2 = avseed3.val[0];
+    vseed3 = avseed3.val[1];
+    vseed4 = avseed3.val[2];
+    avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
+    vseed5 = avseed3.val[0];
+    vseed6 = avseed3.val[1];
+    vseed7 = avseed3.val[2];
+    vseed8 = *(pseed + 6);
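+    // Together with vseed (block 0 of the key schedule, consumed above),
+    // blocks 1-7 give eight independent 128-bit seeds.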
+
+    vseed2 = vaeseq_u8(vseed2, vinit);
+    vseed2 = vaesmcq_u8(vseed2);
+    vseed3 = vaeseq_u8(vseed3, vinit);
+    vseed3 = vaesmcq_u8(vseed3);
+    vseed4 = vaeseq_u8(vseed4, vinit);
+    vseed4 = vaesmcq_u8(vseed4);
+    vseed5 = vaeseq_u8(vseed5, vinit);
+    vseed5 = vaesmcq_u8(vseed5);
+    vseed6 = vaeseq_u8(vseed6, vinit);
+    vseed6 = vaesmcq_u8(vseed6);
+    vseed7 = vaeseq_u8(vseed7, vinit);
+    vseed7 = vaesmcq_u8(vseed7);
+    vseed8 = vaeseq_u8(vseed8, vinit);
+    vseed8 = vaesmcq_u8(vseed8);
+
+    avval2 = vld1q_u8_x2((uint8*)(p));
+    vval = avval2.val[0];
+    vval2 = avval2.val[1];
+    avval2 = vld1q_u8_x2((uint8*)(p) + 32);
+    vval3 = avval2.val[0];
+    vval4 = avval2.val[1];
+    avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
+    vval5 = avval2.val[0];
+    vval6 = avval2.val[1];
+    avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+    vval7 = avval2.val[0];
+    vval8 = avval2.val[1];
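+    // The first 64 and last 64 bytes overlap when size < 128.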
+
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval2 = vaesmcq_u8(vval2);
+    vval3 = vaeseq_u8(vval3, vseed3);
+    vval3 = vaesmcq_u8(vval3);
+    vval4 = vaeseq_u8(vval4, vseed4);
+    vval4 = vaesmcq_u8(vval4);
+    vval5 = vaeseq_u8(vval5, vseed5);
+    vval5 = vaesmcq_u8(vval5);
+    vval6 = vaeseq_u8(vval6, vseed6);
+    vval6 = vaesmcq_u8(vval6);
+    vval7 = vaeseq_u8(vval7, vseed7);
+    vval7 = vaesmcq_u8(vval7);
+    vval8 = vaeseq_u8(vval8, vseed8);
+    vval8 = vaesmcq_u8(vval8);
+
+    vval = vaeseq_u8(vval, vseed);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval2 = vaesmcq_u8(vval2);
+    vval3 = vaeseq_u8(vval3, vseed3);
+    vval3 = vaesmcq_u8(vval3);
+    vval4 = vaeseq_u8(vval4, vseed4);
+    vval4 = vaesmcq_u8(vval4);
+    vval5 = vaeseq_u8(vval5, vseed5);
+    vval5 = vaesmcq_u8(vval5);
+    vval6 = vaeseq_u8(vval6, vseed6);
+    vval6 = vaesmcq_u8(vval6);
+    vval7 = vaeseq_u8(vval7, vseed7);
+    vval7 = vaesmcq_u8(vval7);
+    vval8 = vaeseq_u8(vval8, vseed8);
+    vval8 = vaesmcq_u8(vval8);
+
+    vval = vaeseq_u8(vval, vseed);
+    vval2 = vaeseq_u8(vval2, vseed2);
+    vval3 = vaeseq_u8(vval3, vseed3);
+    vval4 = vaeseq_u8(vval4, vseed4);
+    vval5 = vaeseq_u8(vval5, vseed5);
+    vval6 = vaeseq_u8(vval6, vseed6);
+    vval7 = vaeseq_u8(vval7, vseed7);
+    vval8 = vaeseq_u8(vval8, vseed8);
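+
+    // Fold the eight lanes down to one.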
+    vval ^= vval5;
+    vval2 ^= vval6;
+    vval3 ^= vval7;
+    vval4 ^= vval8;
+    vval ^= vval3;
+    vval2 ^= vval4;
+    vval ^= vval2;
+
+    return vreinterpretq_u64_u8(vval)[0];
+  } else {
+    // For some reason vld1q_u8_x4 is missing.
+    avseed3 = vld1q_u8_x3((uint8*)(pseed));
+    vseed2 = avseed3.val[0];
+    vseed3 = avseed3.val[1];
+    vseed4 = avseed3.val[2];
+    avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
+    vseed5 = avseed3.val[0];
+    vseed6 = avseed3.val[1];
+    vseed7 = avseed3.val[2];
+    vseed8 = *(pseed + 6);
+
+    vseed2 = vaeseq_u8(vseed2, vinit);
+    vseed2 = vaesmcq_u8(vseed2);
+    vseed3 = vaeseq_u8(vseed3, vinit);
+    vseed3 = vaesmcq_u8(vseed3);
+    vseed4 = vaeseq_u8(vseed4, vinit);
+    vseed4 = vaesmcq_u8(vseed4);
+    vseed5 = vaeseq_u8(vseed5, vinit);
+    vseed5 = vaesmcq_u8(vseed5);
+    vseed6 = vaeseq_u8(vseed6, vinit);
+    vseed6 = vaesmcq_u8(vseed6);
+    vseed7 = vaeseq_u8(vseed7, vinit);
+    vseed7 = vaesmcq_u8(vseed7);
+    vseed8 = vaeseq_u8(vseed8, vinit);
+    vseed8 = vaesmcq_u8(vseed8);
+
+    avval2 = vld1q_u8_x2((uint8*)(p) + (size - 128));
+    vval = avval2.val[0];
+    vval2 = avval2.val[1];
+    avval2 = vld1q_u8_x2((uint8*)(p) + (size - 96));
+    vval3 = avval2.val[0];
+    vval4 = avval2.val[1];
+    avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
+    vval5 = avval2.val[0];
+    vval6 = avval2.val[1];
+    avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+    vval7 = avval2.val[0];
+    vval8 = avval2.val[1];
+
+    vvalLoop = vseed;
+    vvalLoop2 = vseed2;
+    vvalLoop3 = vseed3;
+    vvalLoop4 = vseed4;
+    vvalLoop5 = vseed5;
+    vvalLoop6 = vseed6;
+    vvalLoop7 = vseed7;
+    vvalLoop8 = vseed8;
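+
+    // The hash state is primed with the final 128 bytes; the loop then
+    // walks the buffer from the start, folding in 128 bytes per pass and
+    // using each freshly loaded block as the next round-key operand.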
+    size--;
+    size >>= 7;
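+    // size is now the number of 128-byte loop iterations:
+    // floor((original size - 1) / 128), at least 1 since size > 128 here.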
+    do {
+      vval = vaeseq_u8(vval, vvalLoop);
+      vval = vaesmcq_u8(vval);
+      vval2 = vaeseq_u8(vval2, vvalLoop2);
+      vval2 = vaesmcq_u8(vval2);
+      vval3 = vaeseq_u8(vval3, vvalLoop3);
+      vval3 = vaesmcq_u8(vval3);
+      vval4 = vaeseq_u8(vval4, vvalLoop4);
+      vval4 = vaesmcq_u8(vval4);
+      vval5 = vaeseq_u8(vval5, vvalLoop5);
+      vval5 = vaesmcq_u8(vval5);
+      vval6 = vaeseq_u8(vval6, vvalLoop6);
+      vval6 = vaesmcq_u8(vval6);
+      vval7 = vaeseq_u8(vval7, vvalLoop7);
+      vval7 = vaesmcq_u8(vval7);
+      vval8 = vaeseq_u8(vval8, vvalLoop8);
+      vval8 = vaesmcq_u8(vval8);
+
+      avval2 = vld1q_u8_x2((uint8*)(p));
+      vvalLoop = avval2.val[0];
+      vvalLoop2 = avval2.val[1];
+      avval2 = vld1q_u8_x2((uint8*)(p) + 32);
+      vvalLoop3 = avval2.val[0];
+      vvalLoop4 = avval2.val[1];
+      avval2 = vld1q_u8_x2((uint8*)(p) + 64);
+      vvalLoop5 = avval2.val[0];
+      vvalLoop6 = avval2.val[1];
+      avval2 = vld1q_u8_x2((uint8*)(p) + 96);
+      vvalLoop7 = avval2.val[0];
+      vvalLoop8 = avval2.val[1];
+
+      p = (void *)((uint8*)(p) + 128);
+
+      vval = vaeseq_u8(vval, vvalLoop);
+      vval = vaesmcq_u8(vval);
+      vval2 = vaeseq_u8(vval2, vvalLoop2);
+      vval2 = vaesmcq_u8(vval2);
+      vval3 = vaeseq_u8(vval3, vvalLoop3);
+      vval3 = vaesmcq_u8(vval3);
+      vval4 = vaeseq_u8(vval4, vvalLoop4);
+      vval4 = vaesmcq_u8(vval4);
+      vval5 = vaeseq_u8(vval5, vvalLoop5);
+      vval5 = vaesmcq_u8(vval5);
+      vval6 = vaeseq_u8(vval6, vvalLoop6);
+      vval6 = vaesmcq_u8(vval6);
+      vval7 = vaeseq_u8(vval7, vvalLoop7);
+      vval7 = vaesmcq_u8(vval7);
+      vval8 = vaeseq_u8(vval8, vvalLoop8);
+      vval8 = vaesmcq_u8(vval8);
+    } while (--size > 0);
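+
+    // Three more rounds (the last without the MixColumns step) finish
+    // scrambling the state against the final block loaded in the loop.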
+    vval = vaeseq_u8(vval, vvalLoop);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vvalLoop2);
+    vval2 = vaesmcq_u8(vval2);
+    vval3 = vaeseq_u8(vval3, vvalLoop3);
+    vval3 = vaesmcq_u8(vval3);
+    vval4 = vaeseq_u8(vval4, vvalLoop4);
+    vval4 = vaesmcq_u8(vval4);
+    vval5 = vaeseq_u8(vval5, vvalLoop5);
+    vval5 = vaesmcq_u8(vval5);
+    vval6 = vaeseq_u8(vval6, vvalLoop6);
+    vval6 = vaesmcq_u8(vval6);
+    vval7 = vaeseq_u8(vval7, vvalLoop7);
+    vval7 = vaesmcq_u8(vval7);
+    vval8 = vaeseq_u8(vval8, vvalLoop8);
+    vval8 = vaesmcq_u8(vval8);
+
+
+    vval = vaeseq_u8(vval, vvalLoop);
+    vval = vaesmcq_u8(vval);
+    vval2 = vaeseq_u8(vval2, vvalLoop2);
+    vval2 = vaesmcq_u8(vval2);
+    vval3 = vaeseq_u8(vval3, vvalLoop3);
+    vval3 = vaesmcq_u8(vval3);
+    vval4 = vaeseq_u8(vval4, vvalLoop4);
+    vval4 = vaesmcq_u8(vval4);
+    vval5 = vaeseq_u8(vval5, vvalLoop5);
+    vval5 = vaesmcq_u8(vval5);
+    vval6 = vaeseq_u8(vval6, vvalLoop6);
+    vval6 = vaesmcq_u8(vval6);
+    vval7 = vaeseq_u8(vval7, vvalLoop7);
+    vval7 = vaesmcq_u8(vval7);
+    vval8 = vaeseq_u8(vval8, vvalLoop8);
+    vval8 = vaesmcq_u8(vval8);
+
+    vval = vaeseq_u8(vval, vvalLoop);
+    vval2 = vaeseq_u8(vval2, vvalLoop2);
+    vval3 = vaeseq_u8(vval3, vvalLoop3);
+    vval4 = vaeseq_u8(vval4, vvalLoop4);
+    vval5 = vaeseq_u8(vval5, vvalLoop5);
+    vval6 = vaeseq_u8(vval6, vvalLoop6);
+    vval7 = vaeseq_u8(vval7, vvalLoop7);
+    vval8 = vaeseq_u8(vval8, vvalLoop8);
+
+    vval ^= vval5;
+    vval2 ^= vval6;
+    vval3 ^= vval7;
+    vval4 ^= vval8;
+    vval ^= vval3;
+    vval2 ^= vval4;
+    vval ^= vval2;
+
+    return vreinterpretq_u64_u8(vval)[0];
+  }
+}
+
+#else // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)
 
 uintptr aeshashbody(void* p __attribute__((unused)),
                     uintptr seed __attribute__((unused)),
                     uintptr size __attribute__((unused)),
                     Slice aeskeysched __attribute__((unused))) {
-  // We should never get here on a non-x86 system.
+  // We should never get here on a non-x86, non-arm64 system.
   runtime_throw("impossible call to aeshashbody");
 }
 