Merged eigen/eigen into default

This commit is contained in:
Rasmus Munk Larsen 2018-09-20 11:41:15 -07:00
commit 8e2be7777e
20 changed files with 185 additions and 208 deletions

View File

@ -551,7 +551,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
template <>
EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -561,7 +561,7 @@ EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
template <>
EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from,
Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
@ -572,7 +572,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to,
const Packet16f& from,
Index stride) {
Packet16i stride_vector = _mm512_set1_epi32(stride);
Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
Packet16i stride_multiplier =
_mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
@ -582,7 +582,7 @@ template <>
EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to,
const Packet8d& from,
Index stride) {
Packet8i stride_vector = _mm256_set1_epi32(stride);
Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
_mm512_i32scatter_pd(to, indices, from, 8);
@ -660,8 +660,7 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
#ifdef EIGEN_VECTORIZE_AVX512DQ
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \
OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1);
OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
#else
#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
@ -855,7 +854,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
final_1 = _mm256_add_pd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
__m512d final_output = _mm512_insertf64x4(final_output, final_0, 0);
__m512d final_output = _mm512_castpd256_pd512(final_0);
return _mm512_insertf64x4(final_output, final_1, 1);
}

View File

@ -542,11 +542,15 @@ template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* fr
}
template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
_mm256_store_si256((__m256i*)to, from.x);
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_store_si256((__m256i*)(void*)to, from.x);
}
template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
_mm256_storeu_si256((__m256i*)to, from.x);
// (void*) -> workaround clang warning:
// cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
_mm256_storeu_si256((__m256i*)(void*)to, from.x);
}
template<> EIGEN_STRONG_INLINE Packet16h

View File

@ -390,6 +390,7 @@ public:
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
typedef ResPacket AccPacket;
@ -496,6 +497,7 @@ public:
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
typedef ResPacket AccPacket;
@ -626,6 +628,7 @@ public:
typedef typename packet_traits<Scalar>::type ScalarPacket;
typedef DoublePacket<RealPacket> DoublePacketType;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
typedef typename conditional<Vectorizable,RealPacket, Scalar>::type LhsPacket;
typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
@ -777,6 +780,7 @@ public:
typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
typedef LhsPacket LhsPacket4Packing;
typedef ResPacket AccPacket;
@ -1025,9 +1029,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
@ -1035,9 +1039,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r0.storePacket(1 * Traits::ResPacketSize, R1);
r0.storePacket(2 * Traits::ResPacketSize, R2);
R0 = r1.loadPacket(0 * Traits::ResPacketSize);
R1 = r1.loadPacket(1 * Traits::ResPacketSize);
R2 = r1.loadPacket(2 * Traits::ResPacketSize);
R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C1, alphav, R0);
traits.acc(C5, alphav, R1);
traits.acc(C9, alphav, R2);
@ -1045,9 +1049,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r1.storePacket(1 * Traits::ResPacketSize, R1);
r1.storePacket(2 * Traits::ResPacketSize, R2);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
R2 = r2.loadPacket(2 * Traits::ResPacketSize);
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C10, alphav, R2);
@ -1055,9 +1059,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r2.storePacket(1 * Traits::ResPacketSize, R1);
r2.storePacket(2 * Traits::ResPacketSize, R2);
R0 = r3.loadPacket(0 * Traits::ResPacketSize);
R1 = r3.loadPacket(1 * Traits::ResPacketSize);
R2 = r3.loadPacket(2 * Traits::ResPacketSize);
R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C3, alphav, R0);
traits.acc(C7, alphav, R1);
traits.acc(C11, alphav, R2);
@ -1134,9 +1138,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1, R2;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r0.loadPacket(2 * Traits::ResPacketSize);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C8, alphav, R2);
@ -1244,10 +1248,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1, R2, R3;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R2 = r1.loadPacket(0 * Traits::ResPacketSize);
R3 = r1.loadPacket(1 * Traits::ResPacketSize);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
traits.acc(C1, alphav, R2);
@ -1257,10 +1261,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
r1.storePacket(0 * Traits::ResPacketSize, R2);
r1.storePacket(1 * Traits::ResPacketSize, R3);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r2.loadPacket(1 * Traits::ResPacketSize);
R2 = r3.loadPacket(0 * Traits::ResPacketSize);
R3 = r3.loadPacket(1 * Traits::ResPacketSize);
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C6, alphav, R1);
traits.acc(C3, alphav, R2);
@ -1337,8 +1341,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r0.loadPacket(1 * Traits::ResPacketSize);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C4, alphav, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
@ -1431,15 +1435,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
ResPacket R0, R1;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R1 = r1.loadPacket(0 * Traits::ResPacketSize);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
traits.acc(C1, alphav, R1);
r0.storePacket(0 * Traits::ResPacketSize, R0);
r1.storePacket(0 * Traits::ResPacketSize, R1);
R0 = r2.loadPacket(0 * Traits::ResPacketSize);
R1 = r3.loadPacket(0 * Traits::ResPacketSize);
R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
R1 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C2, alphav, R0);
traits.acc(C3, alphav, R1);
r2.storePacket(0 * Traits::ResPacketSize, R0);
@ -1504,7 +1508,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
#undef EIGEN_GEBGP_ONESTEP
ResPacket R0;
ResPacket alphav = pset1<ResPacket>(alpha);
R0 = r0.loadPacket(0 * Traits::ResPacketSize);
R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
traits.acc(C0, alphav, R0);
r0.storePacket(0 * Traits::ResPacketSize, R0);
}
@ -1685,19 +1689,18 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
//
// 32 33 34 35 ...
// 36 36 38 39 ...
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
{
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
enum { PacketSize = unpacket_traits<Packet>::size };
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@ -1725,9 +1728,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
for(Index k=0; k<depth; k++)
{
Packet A, B, C;
A = lhs.loadPacket(i+0*PacketSize, k);
B = lhs.loadPacket(i+1*PacketSize, k);
C = lhs.loadPacket(i+2*PacketSize, k);
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@ -1745,8 +1748,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
for(Index k=0; k<depth; k++)
{
Packet A, B;
A = lhs.loadPacket(i+0*PacketSize, k);
B = lhs.loadPacket(i+1*PacketSize, k);
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
}
@ -1763,7 +1766,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
for(Index k=0; k<depth; k++)
{
Packet A;
A = lhs.loadPacket(i+0*PacketSize, k);
A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
pstore(blockA+count, cj.pconj(A));
count+=PacketSize;
}
@ -1793,19 +1796,18 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Co
}
}
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
{
typedef typename DataMapper::LinearMapper LinearMapper;
EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
};
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
{
typedef typename packet_traits<Scalar>::type Packet;
enum { PacketSize = packet_traits<Scalar>::size };
enum { PacketSize = unpacket_traits<Packet>::size };
EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
EIGEN_UNUSED_VARIABLE(stride);
@ -1837,7 +1839,7 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Ro
for (Index m = 0; m < pack; m += PacketSize)
{
PacketBlock<Packet> kernel;
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
ptranspose(kernel);
for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
}
@ -1971,10 +1973,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
{
for(; k<peeled_k; k+=PacketSize) {
PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
kernel.packet[0] = dm0.loadPacket(k);
kernel.packet[1%PacketSize] = dm1.loadPacket(k);
kernel.packet[2%PacketSize] = dm2.loadPacket(k);
kernel.packet[3%PacketSize] = dm3.loadPacket(k);
kernel.packet[0 ] = dm0.template loadPacket<Packet>(k);
kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
ptranspose(kernel);
pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
@ -2075,7 +2077,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Co
for(Index k=0; k<depth; k++)
{
if (PacketSize==4) {
Packet A = rhs.loadPacket(k, j2);
Packet A = rhs.template loadPacket<Packet>(k, j2);
pstoreu(blockB+count, cj.pconj(A));
count += PacketSize;
} else {

View File

@ -75,7 +75,7 @@ static void run(Index rows, Index cols, Index depth,
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction
Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;

View File

@ -84,7 +84,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
@ -110,7 +110,6 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc,
(std::min)(size,i2), alpha, -1, -1, 0, 0);
sybb(_res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha);
if (UpLo==Upper)
@ -160,7 +159,7 @@ struct tribb_kernel
if(UpLo==Upper)
gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// selfadjoint micro block
{
Index i = j;
@ -168,6 +167,7 @@ struct tribb_kernel
// 1 - apply the kernel on the temporary buffer
gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-1, -1, 0, 0);
// 2 - triangular accumulation
for(Index j1=0; j1<actualBlockSize; ++j1)
{

View File

@ -352,7 +352,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
for(Index k2=0; k2<size; k2+=kc)
{
@ -387,7 +387,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
for(Index i2=k2+kc; i2<size; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,size)-i2;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
@ -437,7 +437,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=0; k2<size; k2+=kc)

View File

@ -151,7 +151,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
for(Index k2=IsLower ? depth : 0;
@ -222,7 +222,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
for(Index i2=start; i2<end; i2+=mc)
{
const Index actual_mc = (std::min)(i2+mc,end)-i2;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
@ -299,7 +299,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
triangularBuffer.diagonal().setOnes();
gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;

View File

@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
conj_if<Conjugate> conj;
gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
// the goal here is to subdivise the Rhs panels such that we keep some cache
@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
for(Index k2=IsLower ? size : 0;
IsLower ? k2>0 : k2<size;

View File

@ -24,7 +24,7 @@ struct gebp_kernel;
template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
struct gemm_pack_rhs;
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
struct gemm_pack_lhs;
template<
@ -156,11 +156,9 @@ class BlasVectorMapper {
};
template<typename Scalar, typename Index, int AlignmentType>
class BlasLinearMapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
class BlasLinearMapper
{
public:
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
@ -171,29 +169,25 @@ class BlasLinearMapper {
return m_data[i];
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
return ploadt<Packet, AlignmentType>(m_data + i);
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
return ploadt<PacketType, AlignmentType>(m_data + i);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
return ploadt<HalfPacket, AlignmentType>(m_data + i);
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
}
protected:
protected:
Scalar *m_data;
};
// Lightweight helper class to access matrix coefficients.
template<typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned>
class blas_data_mapper {
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
class blas_data_mapper
{
public:
typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
typedef BlasVectorMapper<Scalar, Index> VectorMapper;
@ -218,8 +212,9 @@ class blas_data_mapper {
return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
return ploadt<Packet, AlignmentType>(&operator()(i, j));
template<typename PacketType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
return ploadt<PacketType, AlignmentType>(&operator()(i, j));
}
template <typename PacketT, int AlignmentT>
@ -227,10 +222,6 @@ class blas_data_mapper {
return ploadt<PacketT, AlignmentT>(&operator()(i, j));
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
}
template<typename SubPacket>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const {
pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
@ -251,7 +242,7 @@ class blas_data_mapper {
return internal::first_default_aligned(m_data, size);
}
protected:
protected:
Scalar* EIGEN_RESTRICT m_data;
const Index m_stride;
};

View File

@ -198,7 +198,7 @@ ei_add_test(smallvectors)
ei_add_test(mapped_matrix)
ei_add_test(mapstride)
ei_add_test(mapstaticmethods)
ei_add_test(array)
ei_add_test(array_cwise)
ei_add_test(array_for_matrix)
ei_add_test(array_replicate)
ei_add_test(array_reverse)

View File

@ -453,7 +453,7 @@ template<typename ArrayType> void min_max(const ArrayType& m)
}
EIGEN_DECLARE_TEST(array)
EIGEN_DECLARE_TEST(array_cwise)
{
for(int i = 0; i < g_repeat; i++) {
CALL_SUBTEST_1( array(Array<float, 1, 1>()) );

View File

@ -148,24 +148,25 @@ template<typename Scalar> void packetmath()
for (int offset=0; offset<PacketSize; ++offset)
{
#define MIN(A,B) (A<B?A:B)
packets[0] = internal::pload<Packet>(data1);
packets[1] = internal::pload<Packet>(data1+PacketSize);
if (offset==0) internal::palign<0>(packets[0], packets[1]);
else if (offset==1) internal::palign<1>(packets[0], packets[1]);
else if (offset==2) internal::palign<2>(packets[0], packets[1]);
else if (offset==3) internal::palign<3>(packets[0], packets[1]);
else if (offset==4) internal::palign<4>(packets[0], packets[1]);
else if (offset==5) internal::palign<5>(packets[0], packets[1]);
else if (offset==6) internal::palign<6>(packets[0], packets[1]);
else if (offset==7) internal::palign<7>(packets[0], packets[1]);
else if (offset==8) internal::palign<8>(packets[0], packets[1]);
else if (offset==9) internal::palign<9>(packets[0], packets[1]);
else if (offset==10) internal::palign<10>(packets[0], packets[1]);
else if (offset==11) internal::palign<11>(packets[0], packets[1]);
else if (offset==12) internal::palign<12>(packets[0], packets[1]);
else if (offset==13) internal::palign<13>(packets[0], packets[1]);
else if (offset==14) internal::palign<14>(packets[0], packets[1]);
else if (offset==15) internal::palign<15>(packets[0], packets[1]);
else if (offset==1) internal::palign<MIN(1,PacketSize-1)>(packets[0], packets[1]);
else if (offset==2) internal::palign<MIN(2,PacketSize-1)>(packets[0], packets[1]);
else if (offset==3) internal::palign<MIN(3,PacketSize-1)>(packets[0], packets[1]);
else if (offset==4) internal::palign<MIN(4,PacketSize-1)>(packets[0], packets[1]);
else if (offset==5) internal::palign<MIN(5,PacketSize-1)>(packets[0], packets[1]);
else if (offset==6) internal::palign<MIN(6,PacketSize-1)>(packets[0], packets[1]);
else if (offset==7) internal::palign<MIN(7,PacketSize-1)>(packets[0], packets[1]);
else if (offset==8) internal::palign<MIN(8,PacketSize-1)>(packets[0], packets[1]);
else if (offset==9) internal::palign<MIN(9,PacketSize-1)>(packets[0], packets[1]);
else if (offset==10) internal::palign<MIN(10,PacketSize-1)>(packets[0], packets[1]);
else if (offset==11) internal::palign<MIN(11,PacketSize-1)>(packets[0], packets[1]);
else if (offset==12) internal::palign<MIN(12,PacketSize-1)>(packets[0], packets[1]);
else if (offset==13) internal::palign<MIN(13,PacketSize-1)>(packets[0], packets[1]);
else if (offset==14) internal::palign<MIN(14,PacketSize-1)>(packets[0], packets[1]);
else if (offset==15) internal::palign<MIN(15,PacketSize-1)>(packets[0], packets[1]);
internal::pstore(data2, packets[0]);
for (int i=0; i<PacketSize; ++i)

View File

@ -22,6 +22,14 @@
#include "main.h"
#include <typeinfo>
// Disable "ignoring attributes on template argument"
// for packet_traits<Packet*>
// => The only workaround would be to wrap _m128 and the likes
// within wrappers.
#if EIGEN_GNUC_AT_LEAST(6,0)
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif
using internal::demangle_flags;
using internal::demangle_traversal;
using internal::demangle_unrolling;

View File

@ -620,7 +620,7 @@ struct TensorContractionEvaluatorBase
typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
// Declare GEBP packing and kernel structs
internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> pack_lhs;
internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;

View File

@ -549,12 +549,12 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
#define prefetch_lhs(reg, row, col) \
if (!CHECK_LHS_BOUNDARY) { \
if (col < k_size) { \
reg =lhs.template loadPacket<Unaligned>(row, col); \
reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
} \
} else { \
if (col < k_size) { \
if (row + 3 < m_size) { \
reg =lhs.template loadPacket<Unaligned>(row, col); \
reg =lhs.template loadPacket<float4,Unaligned>(row, col); \
} else if (row + 2 < m_size) { \
reg.x =lhs(row + 0, col); \
reg.y =lhs(row + 1, col); \
@ -584,7 +584,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@ -599,7 +599,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
} else {
if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
@ -799,37 +799,37 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_LHS_BOUNDARY) {
if ((threadIdx.y/4+k+24) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else {
// just CHECK_LHS_BOUNDARY
if (lhs_vert + 3 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf3 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
} else if ((threadIdx.y/4+k+16) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
} else if ((threadIdx.y/4+k+8) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
} else if ((threadIdx.y/4+k) < k_size) {
lhs_pf0 =lhs.template loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
}
} else if (lhs_vert + 2 < m_size) {
if ((threadIdx.y/4+k+24) < k_size) {
@ -918,8 +918,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (!CHECK_RHS_BOUNDARY) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@ -941,8 +941,8 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
if (rhs_horiz1 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf1 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
} else if (rhs_vert + 2 < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
@ -963,7 +963,7 @@ EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
} else if (rhs_horiz0 < n_size) {
if ((rhs_vert + 3) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0 = rhs.template loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
} else if ((rhs_vert + 2) < k_size) {
// just CHECK_RHS_BOUNDARY
rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);

View File

@ -238,9 +238,6 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
template <typename PacketT,int AlignmentType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
@ -284,27 +281,10 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
return pload<PacketT>(data);
}
template <int AlignmentType>
template <typename PacketT,int AlignmentType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
return this->load<Packet,AlignmentType>(i,j);
}
template <int AlignmentType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
const Index half_packet_size = unpacket_traits<HalfPacket>::size;
if (half_packet_size == packet_size) {
return loadPacket<AlignmentType>(i, j);
}
EIGEN_ALIGN_MAX Scalar data[half_packet_size];
for (Index k = 0; k < half_packet_size; k++) {
data[k] = operator()(i + k, j);
}
return pload<HalfPacket>(data);
EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
return this->load<PacketT,AlignmentType>(i,j);
}
};
@ -314,7 +294,8 @@ template<typename Scalar, typename Index, int side,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
: public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
@ -327,12 +308,11 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
typedef typename Tensor::PacketReturnType Packet;
template <int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
template <typename PacketT,int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
EIGEN_ALIGN_MAX Scalar data[1];
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<typename Tensor::PacketReturnType>(data);
return pload<PacketT>(data);
}
template <typename PacketT,int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
@ -340,10 +320,6 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<PacketT>(data);
}
template <int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
return loadPacket(i, j);
}
};
@ -354,8 +330,6 @@ template<typename Scalar, typename Index, int side,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
class TensorContractionSubMapper {
public:
typedef typename Tensor::PacketReturnType Packet;
typedef typename unpacket_traits<Packet>::half HalfPacket;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
@ -390,17 +364,20 @@ class TensorContractionSubMapper {
return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<Alignment>(i, 0);
return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
}
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<Alignment>(i, j);
return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
}
return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT, int AlignmentType>
@ -411,14 +388,8 @@ class TensorContractionSubMapper {
return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
}
return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet& p) const {
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
if (UseDirectOffsets) {
m_base_mapper.storePacket(i, 0, p);
}
@ -434,15 +405,15 @@ class TensorContractionSubMapper {
template <typename PacketT, int AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
}
return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
}
template <typename Packet>
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
return false;
}

View File

@ -244,7 +244,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
RhsMapper;
typedef internal::gemm_pack_lhs<LhsScalar, Index,
typename LhsMapper::SubMapper, Traits::mr,
Traits::LhsProgress, ColMajor>
Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
LhsPacker;
typedef internal::gemm_pack_rhs<
RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>

View File

@ -21,9 +21,9 @@ namespace Eigen {
template <typename T, size_t n> class array {
public:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }

View File

@ -162,6 +162,7 @@ int main()
typedef ResScalar LhsPacket;
typedef ResScalar RhsPacket;
typedef ResScalar ResPacket;
typedef LhsPacket LhsPacket4Packing;
};

View File

@ -83,10 +83,10 @@ static void test_expr_shuffling()
Tensor<float, 4, DataLayout> result(5, 7, 3, 2);
array<ptrdiff_t, 4> src_slice_dim({2, 3, 1, 7});
array<ptrdiff_t, 4> src_slice_start({0, 0, 0, 0});
array<ptrdiff_t, 4> dst_slice_dim({1, 7, 3, 2});
array<ptrdiff_t, 4> dst_slice_start({0, 0, 0, 0});
array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}};
array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}};
array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}};
array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}};
for (int i = 0; i < 5; ++i) {
result.slice(dst_slice_start, dst_slice_dim) =