From 58fe2fc2b21401365ace575738d878dad21eb184 Mon Sep 17 00:00:00 2001 From: Gael Guennebaud Date: Thu, 27 Mar 2014 23:38:50 +0100 Subject: [PATCH] enforce the use of vfmadd231ps for pmadd (gcc and clang stupidly generate the other fmadd variants plus some register moves...) --- Eigen/src/Core/arch/AVX/PacketMath.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 96a4bc08c..dceddb518 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -124,7 +124,19 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co } #ifdef EIGEN_VECTORIZE_FMA -template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { return _mm256_fmadd_ps(a,b,c); } +template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { +#if defined(__clang__) || defined(__GNUC__) + // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, + // and gcc stupidly generates a vfmadd132ps instruction, + // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate + // the result of the product. + Packet8f res = c; + asm("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); + return res; +#else + return _mm256_fmadd_ps(a,b,c); +#endif +} template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { return _mm256_fmadd_pd(a,b,c); } #endif