From 8d1302f566f70c29d0ba864972ab50460a96cf2d Mon Sep 17 00:00:00 2001
From: Kan Chen
Date: Thu, 28 May 2020 20:26:09 +0800
Subject: [PATCH] Add support for PacketBlock<Packet8s,4> and
 PacketBlock<Packet16uc,4> ptranspose on NEON

---
Notes (this section is ignored by `git am`):
  Adds two ptranspose overloads for 4-packet blocks. Each one interleaves
  packets pairwise at element width (vzipq_s16 / vzipq_u8), then interleaves
  the results again at twice that width (vzipq_u32 / vzipq_u16), and writes
  the four resulting vectors back into kernel.packet[0..3].

  NOTE(review): the template arguments in the subject and in the ptranspose
  signatures were missing from the copy this patch was restored from. They
  were reconstructed from the NEON vector types used in each body
  (int16x4_t -> Packet4s, int16x8_t -> Packet8s, uint8x16_t -> Packet16uc)
  and from the packet indices accessed (0..3 -> 4). The <Packet8s, 8> on the
  trailing context line is inferred -- TODO confirm against the target file.

 Eigen/src/Core/arch/NEON/PacketMath.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index e11af1dca..065c8100f 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -2869,6 +2869,35 @@ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4s, 4>& kernel)
   kernel.packet[2] = vreinterpret_s16_u32(zip32_2.val[0]);
   kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]);
 }
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel)
+{
+  const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]);
+  const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]);
+
+  const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0]));
+  const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1]));
+
+  kernel.packet[0] = vreinterpretq_s16_u32(zip32_1.val[0]);
+  kernel.packet[1] = vreinterpretq_s16_u32(zip32_1.val[1]);
+  kernel.packet[2] = vreinterpretq_s16_u32(zip32_2.val[0]);
+  kernel.packet[3] = vreinterpretq_s16_u32(zip32_2.val[1]);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel)
+{
+  const uint8x16x2_t zip8_1 = vzipq_u8(kernel.packet[0], kernel.packet[1]);
+  const uint8x16x2_t zip8_2 = vzipq_u8(kernel.packet[2], kernel.packet[3]);
+
+  const uint16x8x2_t zip16_1 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[0]), vreinterpretq_u16_u8(zip8_2.val[0]));
+  const uint16x8x2_t zip16_2 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[1]), vreinterpretq_u16_u8(zip8_2.val[1]));
+
+  kernel.packet[0] = vreinterpretq_u8_u16(zip16_1.val[0]);
+  kernel.packet[1] = vreinterpretq_u8_u16(zip16_1.val[1]);
+  kernel.packet[2] = vreinterpretq_u8_u16(zip16_2.val[0]);
+  kernel.packet[3] = vreinterpretq_u8_u16(zip16_2.val[1]);
+}
+
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel)
 {
   const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]);