diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h
index 9e66575a9..9fbb256a1 100644
--- a/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -551,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
                                                              Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier =
       _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -561,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
 template <>
 EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
                                                             Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@@ -572,7 +572,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
                                                          const Packet16f& from,
                                                          Index stride) {
-  Packet16i stride_vector = _mm512_set1_epi32(stride);
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
   Packet16i stride_multiplier =
       _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
   Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@@ -582,7 +582,7 @@ template <>
 EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
                                                          const Packet8d& from,
                                                          Index stride) {
-  Packet8i stride_vector = _mm256_set1_epi32(stride);
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
   Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
   _mm512_i32scatter_pd(to, indices, from, 8);
@@ -660,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
 #ifdef EIGEN_VECTORIZE_AVX512DQ
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0);        \
-  OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
+  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
 #else
 #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
   OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
@@ -855,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
   final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
 
-  __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
+  __m512d final_output = _mm512_castpd256_pd512(final_0);
 
   return _mm512_insertf64x4(final_output, final_1, 1);
 }
diff --git a/Eigen/src/Core/arch/GPU/PacketMathHalf.h b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
index c4feda87d..8787adcde 100644
--- a/Eigen/src/Core/arch/GPU/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/GPU/PacketMathHalf.h
@@ -542,11 +542,15 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* fr
 }
 
 template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet16h& from) {
-  _mm256_store_si256((__m256i*)to, from.x);
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_store_si256((__m256i*)(void*)to, from.x);
 }
 
 template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet16h& from) {
-  _mm256_storeu_si256((__m256i*)to, from.x);
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  _mm256_storeu_si256((__m256i*)(void*)to, from.x);
 }
 
 template<> EIGEN_STRONG_INLINE Packet16h
diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 3ec8eb082..b012691c1 100644
--- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -390,6 +390,7 @@ public:
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
   typedef ResPacket AccPacket;
@@ -496,6 +497,7 @@ public:
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
   typedef ResPacket AccPacket;
@@ -626,6 +628,7 @@ public:
   typedef typename packet_traits<Scalar>::type ScalarPacket;
   typedef DoublePacket<RealPacket> DoublePacketType;
 
+  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket4Packing;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
@@ -777,6 +780,7 @@ public:
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
   typedef ResPacket AccPacket;
@@ -1025,9 +1029,9 @@ void gebp_kernel
           ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
@@ -1035,9 +1039,9 @@ void gebp_kernel
-          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C1, alphav, R0);
           traits.acc(C5, alphav, R1);
           traits.acc(C9, alphav, R2);
@@ -1045,9 +1049,9 @@ void gebp_kernel
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C2, alphav, R0);
           traits.acc(C6, alphav, R1);
           traits.acc(C10, alphav, R2);
@@ -1055,9 +1059,9 @@ void gebp_kernel
-          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C3, alphav, R0);
           traits.acc(C7, alphav, R1);
           traits.acc(C11, alphav, R2);
@@ -1134,9 +1138,9 @@ void gebp_kernel
           ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
@@ -1244,10 +1248,10 @@ void gebp_kernel
           ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
-          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C1, alphav, R2);
           traits.acc(C5, alphav, R3);
@@ -1257,10 +1261,10 @@ void gebp_kernel
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
-          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
-          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C2, alphav, R0);
           traits.acc(C6, alphav, R1);
           traits.acc(C3, alphav, R2);
           traits.acc(C7, alphav, R3);
@@ -1337,8 +1341,8 @@ void gebp_kernel
           ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1431,15 +1435,15 @@ void gebp_kernel
           ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C1, alphav, R1);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
           r1.storePacket(0 * Traits::ResPacketSize, R1);
 
-          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
-          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
           traits.acc(C2, alphav, R0);
           traits.acc(C3, alphav, R1);
           r2.storePacket(0 * Traits::ResPacketSize, R0);
@@ -1504,7 +1508,7 @@ void gebp_kernel
           ResPacket alphav = pset1<ResPacket>(alpha);
-          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
         }
@@ -1685,19 +1689,18 @@ void gebp_kernel
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  enum { PacketSize = unpacket_traits<Packet>::size };
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1725,9 +1728,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs
-          A = lhs.loadPacket(i+0*PacketSize, k);
-          B = lhs.loadPacket(i+1*PacketSize, k);
-          C = lhs.loadPacket(i+2*PacketSize, k);
+          A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+          B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+          C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
           pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
           pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
           pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -1745,8 +1748,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs
-          A = lhs.loadPacket(i+0*PacketSize, k);
-          B = lhs.loadPacket(i+1*PacketSize, k);
+          A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+          B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
           pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
           pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
         }
@@ -1763,7 +1766,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs
-          A = lhs.loadPacket(i+0*PacketSize, k);
+          A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
           pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
         }
@@ -1793,19 +1796,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+  enum { PacketSize = unpacket_traits<Packet>::size };
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -1837,7 +1839,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs
         PacketBlock<Packet> kernel;
-        for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
+        for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
         ptranspose(kernel);
         for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
       }
@@ -1971,10 +1973,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs
             PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
-            kernel.packet[0] = dm0.loadPacket(k);
-            kernel.packet[1%PacketSize] = dm1.loadPacket(k);
-            kernel.packet[2%PacketSize] = dm2.loadPacket(k);
-            kernel.packet[3%PacketSize] = dm3.loadPacket(k);
+            kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
+            kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
+            kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
+            kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
             ptranspose(kernel);
             pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
             pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
@@ -2075,7 +2077,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs
-          Packet A = rhs.loadPacket(k, j2);
+          Packet A = rhs.template loadPacket<Packet>(k, j2);
           pstoreu(blockB+count, cj.pconj(A));
           count += PacketSize;
         } else {
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h
index bd7b6ff2a..f49abcad5 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -75,7 +75,7 @@ static void run(Index rows, Index cols, Index depth,
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
   Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
   gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
   gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index e436c50a4..ec2825bf0 100644
--- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -84,7 +84,7 @@ struct general_matrix_matrix_triangular_product
-    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
     tribb_kernel sybb;
@@ -110,7 +110,6 @@ struct general_matrix_matrix_triangular_product
diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
--- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs pack_lhs_transposed;
+    gemm_pack_lhs pack_lhs_transposed;
 
     for(Index k2=0; k2<size; k2+=kc)
-        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
           (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
         gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
@@ -437,7 +437,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     symm_pack_rhs<Scalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
 
     for(Index k2=0; k2<size; k2+=kc)
diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h
--- a/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
 
     for(Index k2=IsLower ? depth : 0;
@@ -222,7 +222,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix
-        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
           (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
         gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
@@ -299,7 +299,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs pack_rhs;
     gemm_pack_rhs pack_rhs_panel;
diff --git a/Eigen/src/Core/products/TriangularSolverMatrix.h b/Eigen/src/Core/products/TriangularSolverMatrix.h
index 223c38b86..8ff2e9d9d 100644
--- a/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix
     conj_if<Conjugate> conj;
     gebp_kernel gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
     gemm_pack_rhs pack_rhs;
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
@@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix
     gebp_kernel;
     gemm_pack_rhs pack_rhs;
     gemm_pack_rhs pack_rhs_panel;
-    gemm_pack_lhs pack_lhs_panel;
+    gemm_pack_lhs pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
        IsLower ? k2-=kc : k2+=kc)
diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h
--- a/Eigen/src/Core/util/BlasUtil.h
+++ b/Eigen/src/Core/util/BlasUtil.h
 template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct gemm_pack_rhs;
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
 
 template<
@@ -156,11 +156,9 @@ class BlasVectorMapper {
 };
 
 template<typename Scalar, typename Index, int AlignmentType>
-class BlasLinearMapper {
- public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
+class BlasLinearMapper
+{
+public:
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
@@ -171,29 +169,25 @@ class BlasLinearMapper {
     return m_data[i];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-    return ploadt<Packet, AlignmentType>(m_data + i);
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return ploadt<PacketType, AlignmentType>(m_data + i);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
-    return ploadt<HalfPacket, AlignmentType>(m_data + i);
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
+    pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
-    pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
-  }
-
- protected:
+protected:
   Scalar *m_data;
 };
 
 // Lightweight helper class to access matrix coefficients.
 template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
-class blas_data_mapper {
- public:
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
+class blas_data_mapper
+{
+public:
   typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
   typedef BlasVectorMapper<Scalar, Index> VectorMapper;
@@ -218,8 +212,9 @@ class blas_data_mapper {
     return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
-    return ploadt<Packet, AlignmentType>(&operator()(i, j));
+  template<typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return ploadt<PacketType, AlignmentType>(&operator()(i, j));
   }
 
   template<typename PacketT, int AlignmentT>
@@ -227,10 +222,6 @@ class blas_data_mapper {
     return ploadt<PacketT, AlignmentT>(&operator()(i, j));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
-    return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
-  }
-
   template<typename SubPacket>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
     pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
@@ -251,7 +242,7 @@ class blas_data_mapper {
     return internal::first_default_aligned(m_data, size);
   }
 
- protected:
+protected:
   Scalar* EIGEN_RESTRICT m_data;
   const Index m_stride;
 };
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b4730cff0..c215bfbb2 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -198,7 +198,7 @@ ei_add_test(smallvectors)
 ei_add_test(mapped_matrix)
 ei_add_test(mapstride)
 ei_add_test(mapstaticmethods)
-ei_add_test(array)
+ei_add_test(array_cwise)
 ei_add_test(array_for_matrix)
 ei_add_test(array_replicate)
 ei_add_test(array_reverse)
diff --git a/test/array.cpp b/test/array_cwise.cpp
similarity index 99%
rename from test/array.cpp
rename to test/array_cwise.cpp
index d9c4626c0..84e46665b 100644
--- a/test/array.cpp
+++ b/test/array_cwise.cpp
@@ -453,7 +453,7 @@ template<typename ArrayType> void min_max(const ArrayType& m)
 }
 
-EIGEN_DECLARE_TEST(array)
+EIGEN_DECLARE_TEST(array_cwise)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( array(Array<float, 1, 1>()) );
diff --git a/test/packetmath.cpp b/test/packetmath.cpp
index 58a1c60bf..2b0dda573 100644
--- a/test/packetmath.cpp
+++ b/test/packetmath.cpp
@@ -148,24 +148,25 @@ template<typename Scalar> void packetmath()
   for (int offset=0; offset<PacketSize; ++offset)
   {
     packets[0] = internal::pload<Packet>(data1);
     packets[1] = internal::pload<Packet>(data1+PacketSize);
     if (offset==0) internal::palign<0>(packets[0], packets[1]);
-    else if (offset==1) internal::palign<1>(packets[0], packets[1]);
-    else if (offset==2) internal::palign<2>(packets[0], packets[1]);
-    else if (offset==3) internal::palign<3>(packets[0], packets[1]);
-    else if (offset==4) internal::palign<4>(packets[0], packets[1]);
-    else if (offset==5) internal::palign<5>(packets[0], packets[1]);
-    else if (offset==6) internal::palign<6>(packets[0], packets[1]);
-    else if (offset==7) internal::palign<7>(packets[0], packets[1]);
-    else if (offset==8) internal::palign<8>(packets[0], packets[1]);
-    else if (offset==9) internal::palign<9>(packets[0], packets[1]);
-    else if (offset==10) internal::palign<10>(packets[0], packets[1]);
-    else if (offset==11) internal::palign<11>(packets[0], packets[1]);
-    else if (offset==12) internal::palign<12>(packets[0], packets[1]);
-    else if (offset==13) internal::palign<13>(packets[0], packets[1]);
-    else if (offset==14) internal::palign<14>(packets[0], packets[1]);
-    else if (offset==15) internal::palign<15>(packets[0], packets[1]);
+    else if (offset==1) internal::palign(packets[0], packets[1]);
+    else if (offset==2) internal::palign(packets[0], packets[1]);
+    else if (offset==3) internal::palign(packets[0], packets[1]);
+    else if (offset==4) internal::palign(packets[0], packets[1]);
+    else if (offset==5) internal::palign(packets[0], packets[1]);
+    else if (offset==6) internal::palign(packets[0], packets[1]);
+    else if (offset==7) internal::palign(packets[0], packets[1]);
+    else if (offset==8) internal::palign(packets[0], packets[1]);
+    else if (offset==9) internal::palign(packets[0], packets[1]);
+    else if (offset==10) internal::palign(packets[0], packets[1]);
+    else if (offset==11) internal::palign(packets[0], packets[1]);
+    else if (offset==12) internal::palign(packets[0], packets[1]);
+    else if (offset==13) internal::palign(packets[0], packets[1]);
+    else if (offset==14) internal::palign(packets[0], packets[1]);
+    else if (offset==15) internal::palign(packets[0], packets[1]);
     internal::pstore(data2, packets[0]);
     for (int i=0; i<PacketSize; ++i)
+// Disable "ignoring attributes on template argument"
+// for packet_traits
+// => The only workaround would be to wrap _m128 and the likes
+//    within wrappers.
+#if EIGEN_GNUC_AT_LEAST(6,0)
+  #pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
 using internal::demangle_flags;
 using internal::demangle_traversal;
 using internal::demangle_unrolling;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index f0f61fade..3b22e43e7 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -620,7 +620,7 @@ struct TensorContractionEvaluatorBase
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
     // Declare GEBP packing and kernel structs
-    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
+    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
     internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
 
     internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
index b5e186d21..056665749 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -549,12 +549,12 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
 #define prefetch_lhs(reg, row, col)                           \
     if (!CHECK_LHS_BOUNDARY) {                                \
       if (col < k_size) {                                     \
-        reg =lhs.template loadPacket<Unaligned>(row, col);    \
+        reg =lhs.template loadPacket<float4,Unaligned>(row, col);  \
      }                                                        \
     } else {                                                  \
       if (col < k_size) {                                     \
         if (row + 3 < m_size) {                               \
-          reg =lhs.template loadPacket<Unaligned>(row, col);  \
+          reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
         } else if (row + 2 < m_size) {                        \
           reg.x =lhs(row + 0, col);                           \
           reg.y =lhs(row + 1, col);                           \
@@ -584,7 +584,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
   if (!CHECK_RHS_BOUNDARY) {
     if ((rhs_vert + 3) < k_size) {
       // just CHECK_RHS_BOUNDARY
-      rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+      rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
     } else if (rhs_vert + 2 < k_size) {
       // just CHECK_RHS_BOUNDARY
       rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -599,7 +599,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
   } else {
     if (rhs_horiz0 < n_size) {
       if ((rhs_vert + 3) < k_size) {
-        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
       } else if ((rhs_vert + 2) < k_size) {
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
         rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@@ -799,37 +799,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   if (!CHECK_LHS_BOUNDARY) {
     if ((threadIdx.y/4+k+24) < k_size) {
-      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-      lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-      lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+      lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
     } else if ((threadIdx.y/4+k+16) < k_size) {
-      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-      lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
     } else if ((threadIdx.y/4+k+8) < k_size) {
-      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
    } else if ((threadIdx.y/4+k) < k_size) {
-      lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
     }
   } else {
     // just CHECK_LHS_BOUNDARY
     if (lhs_vert + 3 < m_size) {
       if ((threadIdx.y/4+k+24) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
       } else if ((threadIdx.y/4+k+16) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
       } else if ((threadIdx.y/4+k+8) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
       } else if ((threadIdx.y/4+k) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
       }
     } else if (lhs_vert + 2 < m_size) {
       if ((threadIdx.y/4+k+24) < k_size) {
@@ -918,8 +918,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
   if (!CHECK_RHS_BOUNDARY) {
     if ((rhs_vert + 3) < k_size) {
       // just CHECK_RHS_BOUNDARY
-      rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-      rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+      rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+      rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
     } else if (rhs_vert + 2 < k_size) {
       // just CHECK_RHS_BOUNDARY
       rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -941,8 +941,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
     if (rhs_horiz1 < n_size) {
       if ((rhs_vert + 3) < k_size) {
         // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
-        rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
       } else if (rhs_vert + 2 < k_size) {
         // just CHECK_RHS_BOUNDARY
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@@ -963,7 +963,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
     } else if (rhs_horiz0 < n_size) {
       if ((rhs_vert + 3) < k_size) {
         // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
       } else if ((rhs_vert + 2) < k_size) {
         // just CHECK_RHS_BOUNDARY
         rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index ab320a50d..dbb0f76bb 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -238,9 +238,6 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper
-  typedef typename Tensor::PacketReturnType Packet;
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
-
   template <typename PacketT,int AlignmentType>
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
@@ -284,27 +281,10 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper
     return pload<PacketT>(data);
   }
 
-  template <int AlignmentType>
+  template <typename PacketT,int AlignmentType>
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
-    return this->load<Packet,AlignmentType>(i,j);
-  }
-
-  template <int AlignmentType>
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
-    // whole method makes column major assumption
-
-    // don't need to add offsets for now (because operator handles that)
-    const Index half_packet_size = unpacket_traits<HalfPacket>::size;
-    if (half_packet_size == packet_size) {
-      return loadPacket<AlignmentType>(i, j);
-    }
-    EIGEN_ALIGN_MAX Scalar data[half_packet_size];
-    for (Index k = 0; k < half_packet_size; k++) {
-      data[k] = operator()(i + k, j);
-    }
-    return pload<HalfPacket>(data);
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    return this->load<PacketT,AlignmentType>(i,j);
   }
 };
@@ -314,7 +294,8 @@ template<typename Scalar, typename Index, int side,
          template <class> class MakePointer_>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper
+class BaseTensorContractionMapper
+  : public SimpleTensorContractionMapper
 {
  public:
   typedef SimpleTensorContractionMapper ParentMapper;
@@ -327,12 +308,11 @@ class BaseTensorContractionMapper
-  template <int AlignmentType> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
+  template <typename PacketT,int AlignmentType> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
     EIGEN_ALIGN_MAX Scalar data[1];
     data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
-    return pload<Packet>(data);
+    return pload<PacketT>(data);
   }
   template <typename PacketT,int AlignmentType> EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
@@ -340,10 +320,6 @@ class BaseTensorContractionMapper
     data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
     return pload<PacketT>(data);
   }
-  template <int AlignmentType> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
-    return loadPacket<AlignmentType>(i, j);
-  }
 };
@@ -354,8 +330,6 @@ template<typename Scalar, typename Index, int side,
          template <class> class MakePointer_=MakePointer>
 class TensorContractionSubMapper {
  public:
-  typedef typename Tensor::PacketReturnType Packet;
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
 
   typedef BaseTensorContractionMapper ParentMapper;
   typedef TensorContractionSubMapper Self;
@@ -390,17 +364,20 @@ class TensorContractionSubMapper {
     return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<Alignment>(i, 0);
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
   }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<Alignment>(i, j);
+      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
     }
-    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
   }
 
   template <typename PacketT, int AlignmentType>
@@ -411,14 +388,8 @@ class TensorContractionSubMapper {
     return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
-    if (UseDirectOffsets) {
-      return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
-    }
-    return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const {
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
     if (UseDirectOffsets) {
       m_base_mapper.storePacket(i, 0, p);
     }
@@ -434,15 +405,15 @@ class TensorContractionSubMapper {
   template <typename PacketT, int AlignmentType>
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
-    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
+      return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
   }
 
-  template <typename Packet>
+  template <typename PacketT>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
     return false;
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 1d145c4b1..0980854b4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -244,7 +244,7 @@ struct TensorEvaluator
     typedef internal::gemm_pack_lhs<LhsScalar, Index,
                                     typename LhsMapper::SubMapper, Traits::mr,
-                                    Traits::LhsProgress, ColMajor>
+                                    Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
         LhsPacker;
     typedef internal::gemm_pack_rhs<
         RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index d5c000e08..39c255791 100644
--- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -21,9 +21,9 @@ namespace Eigen {
 template <typename T, size_t n> class array {
  public:
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
+  EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
+  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
 
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport
index 6392bea91..c4ea4ec5f 100644
--- a/unsupported/Eigen/MPRealSupport
+++ b/unsupported/Eigen/MPRealSupport
@@ -162,6 +162,7 @@ int main()
       typedef ResScalar LhsPacket;
       typedef ResScalar RhsPacket;
       typedef ResScalar ResPacket;
+      typedef LhsPacket LhsPacket4Packing;
 
     };
diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp
index 062dd1c0f..2ec85d2d4 100644
--- a/unsupported/test/cxx11_tensor_shuffling.cpp
+++ b/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -83,10 +83,10 @@ static void test_expr_shuffling()
   Tensor<float, 4, DataLayout> result(5, 7, 3, 2);
 
-  array<ptrdiff_t, 4> src_slice_dim({2, 3, 1, 7});
-  array<ptrdiff_t, 4> src_slice_start({0, 0, 0, 0});
-  array<ptrdiff_t, 4> dst_slice_dim({1, 7, 3, 2});
-  array<ptrdiff_t, 4> dst_slice_start({0, 0, 0, 0});
+  array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}};
+  array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}};
+  array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}};
+  array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}};
 
   for (int i = 0; i < 5; ++i) {
     result.slice(dst_slice_start, dst_slice_dim) =