Mirror of https://gitlab.com/libeigen/eigen.git

Commit 306fceccbe: Pulled latest updates from trunk
@@ -227,7 +227,7 @@ if(NOT MSVC)
   option(EIGEN_TEST_NEON "Enable/Disable Neon in tests/examples" OFF)
   if(EIGEN_TEST_NEON)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mcpu=cortex-a8")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp")
     message(STATUS "Enabling NEON in tests/examples")
   endif()
@@ -313,7 +313,8 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
 template<size_t offset, typename Packet>
 struct protate_impl
 {
-  static Packet run(const Packet& a) { return a; }
+  // Empty so attempts to use this unimplemented path will fail to compile.
+  // Only specializations of this template should be used.
 };
 
 /** \internal \returns a packet with the coefficients rotated to the right in little-endian convention,
@@ -322,7 +323,6 @@ struct protate_impl
 */
 template<size_t offset, typename Packet> EIGEN_DEVICE_FUNC inline Packet protate(const Packet& a)
 {
-  EIGEN_STATIC_ASSERT(offset < unpacket_traits<Packet>::size, ROTATION_BY_ILLEGAL_OFFSET);
   return offset ? protate_impl<offset, Packet>::run(a) : a;
 }
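For reference, protate<offset>(a) is documented above as rotating the packet's coefficients to the right in little-endian convention, i.e. lane i of the result holds lane (i + offset) % size of the input. A minimal scalar sketch of that semantics, using a std::array stand-in for a 4-wide packet (illustrative only, not Eigen's actual Packet types):

    #include <array>
    #include <cassert>
    #include <cstddef>

    // Model of protate<offset>: result lane i takes input lane (i + offset) % N.
    template <std::size_t offset, std::size_t N>
    std::array<float, N> protate_model(const std::array<float, N>& a) {
      std::array<float, N> r{};
      for (std::size_t i = 0; i < N; ++i)
        r[i] = a[(i + offset) % N];
      return r;
    }

    int main() {
      const std::array<float, 4> a = {0.f, 1.f, 2.f, 3.f};
      const auto r = protate_model<1>(a);   // expected: {1, 2, 3, 0}
      assert(r[0] == 1.f && r[3] == 0.f);
      return 0;
    }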
@@ -76,12 +76,12 @@ typedef uint32x4_t Packet4ui;
 template<> struct packet_traits<float> : default_packet_traits
 {
   typedef Packet4f type;
-  typedef Packet2f half;
+  typedef Packet4f half; // Packet2f intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket=1,
+    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
 
     HasDiv = 1,
     // FIXME check the Has*
@@ -95,12 +95,12 @@ template<> struct packet_traits<float> : default_packet_traits
 template<> struct packet_traits<int> : default_packet_traits
 {
   typedef Packet4i type;
-  typedef Packet2i half;
+  typedef Packet4i half; // Packet2i intrinsics not implemented yet
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size=4,
-    HasHalfPacket=1
+    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
     // FIXME check the Has*
   };
 };
@@ -88,15 +88,15 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
 #ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
   EIGEN_UNUSED_VARIABLE(num_threads);
   enum {
-    kr = 16,
+    kr = 8,
     mr = Traits::mr,
     nr = Traits::nr
   };
   k = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
   if (k > kr) k -= k % kr;
-  m = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+  m = std::min<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
   if (m > mr) m -= m % mr;
-  n = std::min<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+  n = std::min<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
   if (n > nr) n -= n % nr;
   return;
 #endif
@@ -153,16 +153,104 @@ void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads
   }
   else {
     // In unit tests we do not want to use extra large matrices,
-    // so we reduce the block size to check the blocking strategy is not flawed
-#ifndef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
-    k = std::min<Index>(k,sizeof(LhsScalar)<=4 ? 360 : 240);
-    n = std::min<Index>(n,3840/sizeof(RhsScalar));
-    m = std::min<Index>(m,3840/sizeof(RhsScalar));
-#else
-    k = std::min<Index>(k,24);
-    n = std::min<Index>(n,384/sizeof(RhsScalar));
-    m = std::min<Index>(m,384/sizeof(RhsScalar));
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 4*1024;
+    l2 = 32*1024;
+    l3 = 512*1024;
 #endif
+
+    // Early return for small problems because the computation below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if(std::max(k,std::max(m,n))<48)
+      return;
+
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * Traits::nr * sizeof(ResScalar)
+    };
+
+    // ---- 1st level of blocking on L1, yields kc ----
+
+    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
+    // We also include a register-level block of the result (mx x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = ((l1-k_sub)/k_div) & (~(k_peeling-1));
+    const Index old_k = k;
+    if(k>max_kc)
+    {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k%max_kc)==0 ? max_kc
+                        : max_kc - k_peeling * ((max_kc-1-(k%max_kc))/(k_peeling*(k/max_kc+1)));
+
+      eigen_internal_assert(((old_k/k) == (old_k/max_kc)) && "the number of sweeps has to remain the same");
+    }
+
+    // ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+    //      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+    // The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
+    #ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+    #else
+    const Index actual_l2 = 1572864; // == 1.5 MB
+    #endif
+
+    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc to growth by a factor x1.5, leading to:
+    const Index max_nc = (3*actual_l2)/(2*2*max_kc*sizeof(RhsScalar));
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = std::min<Index>(actual_l2/(2*k*sizeof(RhsScalar)), max_nc) & (~(Traits::nr-1));
+    if(n>nc)
+    {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n%nc)==0 ? nc
+                    : (nc - Traits::nr * ((nc/*-1*/-(n%nc))/(Traits::nr*(n/nc+1))));
+    }
+    else if(old_k==k)
+    {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      Index problem_size = k*n*sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if(problem_size<=1024)
+      {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      }
+      else if(l3!=0 && problem_size<=32768)
+      {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = 576;
+      }
+
+      Index mc = (std::min<Index>)(actual_lm/(3*k*sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr) mc -= mc % Traits::mr;
+
+      m = (m%mc)==0 ? mc
+                    : (mc - Traits::mr * ((mc/*-1*/-(m%mc))/(Traits::mr*(m/mc+1))));
+    }
   }
 }
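To make the first-level bound above concrete, here is a small standalone recomputation of max_kc under assumed values (a 32 KB L1 cache, mr = 12, nr = 4, float scalars, KcFactor = 1; these numbers are illustrative, not taken from the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const std::int64_t l1        = 32 * 1024; // assumed L1 data cache size in bytes
      const std::int64_t mr = 12, nr = 4;       // assumed register-block dimensions
      const std::int64_t scalar_sz = 4;         // float
      const std::int64_t KcFactor  = 1;
      const std::int64_t k_peeling = 8;

      const std::int64_t k_div = KcFactor * (mr * scalar_sz + nr * scalar_sz); // 64
      const std::int64_t k_sub = mr * nr * scalar_sz;                          // 192
      // Same expression as in the patch: round (l1 - k_sub) / k_div down to a multiple of k_peeling.
      const std::int64_t max_kc = ((l1 - k_sub) / k_div) & ~(k_peeling - 1);
      std::printf("max_kc = %lld\n", static_cast<long long>(max_kc)); // prints 504
      return 0;
    }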
@@ -712,6 +800,80 @@ protected:
   conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
 };
 
+// helper for the rotating kernel below
+template <typename GebpKernel, bool UseRotatingKernel = GebpKernel::UseRotatingKernel>
+struct PossiblyRotatingKernelHelper
+{
+  // default implementation, not rotating
+
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    traits.loadRhs(from + (Index+4*K)*Traits::RhsProgress, to);
+  }
+
+  void unrotateResult(AccPacket&,
+                      AccPacket&,
+                      AccPacket&,
+                      AccPacket&)
+  {
+  }
+};
+
+// rotating implementation
+template <typename GebpKernel>
+struct PossiblyRotatingKernelHelper<GebpKernel, true>
+{
+  typedef typename GebpKernel::Traits Traits;
+  typedef typename Traits::RhsScalar RhsScalar;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::AccPacket AccPacket;
+
+  const Traits& traits;
+  PossiblyRotatingKernelHelper(const Traits& t) : traits(t) {}
+
+  template <size_t K, size_t Index>
+  void loadOrRotateRhs(RhsPacket& to, const RhsScalar* from) const
+  {
+    if (Index == 0) {
+      to = pload<RhsPacket>(from + 4*K*Traits::RhsProgress);
+    } else {
+      EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers");
+      to = protate<1>(to);
+    }
+  }
+
+  void unrotateResult(AccPacket& res0,
+                      AccPacket& res1,
+                      AccPacket& res2,
+                      AccPacket& res3)
+  {
+    PacketBlock<AccPacket> resblock;
+    resblock.packet[0] = res0;
+    resblock.packet[1] = res1;
+    resblock.packet[2] = res2;
+    resblock.packet[3] = res3;
+    ptranspose(resblock);
+    resblock.packet[3] = protate<1>(resblock.packet[3]);
+    resblock.packet[2] = protate<2>(resblock.packet[2]);
+    resblock.packet[1] = protate<3>(resblock.packet[1]);
+    ptranspose(resblock);
+    res0 = resblock.packet[0];
+    res1 = resblock.packet[1];
+    res2 = resblock.packet[2];
+    res3 = resblock.packet[3];
+  }
+};
+
 /* optimized GEneral packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
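The rotating specialization relies on one property of protate<1>: after a single pload, repeated rotation lets every lane of the rhs packet see each of the four loaded coefficients, just in a per-lane rotated order, which unrotateResult later undoes on the accumulators (transpose, per-row rotate, transpose). A scalar sketch of that lane schedule, with a std::array stand-in for the packet (illustrative only):

    #include <array>
    #include <cstdio>

    using Packet4 = std::array<int, 4>;  // stand-in for a 4-wide rhs packet

    // Model of protate<1>: lane i receives the old lane (i + 1) % 4.
    Packet4 rotate1(const Packet4& b) { return {{ b[1], b[2], b[3], b[0] }}; }

    int main() {
      Packet4 b = {{ 0, 1, 2, 3 }};    // one pload of four rhs coefficients
      for (int n = 0; n < 4; ++n) {    // the four N steps of the 3pX4 micro kernel
        std::printf("step %d lanes: %d %d %d %d\n", n, b[0], b[1], b[2], b[3]);
        b = rotate1(b);                // instead of broadcasting the next coefficient
      }
      // Lane i holds coefficient (i + n) % 4 at step n, so each accumulator column
      // ends up permuted per lane; that is the permutation unrotateResult reverts.
      return 0;
    }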
@@ -745,6 +907,16 @@ struct gebp_kernel
     ResPacketSize = Traits::ResPacketSize
   };
 
+
+  static const bool UseRotatingKernel =
+    EIGEN_ARCH_ARM &&
+    internal::is_same<LhsScalar, float>::value &&
+    internal::is_same<RhsScalar, float>::value &&
+    internal::is_same<ResScalar, float>::value &&
+    Traits::LhsPacketSize == 4 &&
+    Traits::RhsPacketSize == 4 &&
+    Traits::ResPacketSize == 4;
+
   EIGEN_DONT_INLINE
   void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                   Index rows, Index depth, Index cols, ResScalar alpha,
@@ -778,6 +950,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     // Usually, make sense only with FMA
     if(mr>=3*Traits::LhsProgress)
     {
+      PossiblyRotatingKernelHelper<gebp_kernel> possiblyRotatingKernelHelper(traits);
+
       // loops on each largest micro horizontal panel of lhs (3*Traits::LhsProgress x depth)
       for(Index i=0; i<peeled_mc3; i+=3*Traits::LhsProgress)
       {
@@ -813,43 +987,12 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
         prefetch(&blB[0]);
         LhsPacket A0, A1;
 
-#define EIGEN_ARCH_PREFERS_ROTATING_KERNEL EIGEN_ARCH_ARM
-
-#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
-        static const bool UseRotatingKernel =
-          Traits::LhsPacketSize == 4 &&
-          Traits::RhsPacketSize == 4 &&
-          Traits::ResPacketSize == 4;
-#endif
-
         for(Index k=0; k<peeled_kc; k+=pk)
         {
           EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
           RhsPacket B_0, T0;
           LhsPacket A2;
 
-#define EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N) \
-          traits.loadRhs(&blB[(N+4*K)*RhsProgress], B_0);
-
-#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
-#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
-          do { \
-            if (UseRotatingKernel) { \
-              if (N == 0) { \
-                B_0 = pload<RhsPacket>(&blB[(0+4*K)*RhsProgress]); \
-              } else { \
-                EIGEN_ASM_COMMENT("Do not reorder code, we're very tight on registers"); \
-                B_0 = protate<1>(B_0); \
-              } \
-            } else { \
-              EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N); \
-            } \
-          } while (false)
-#else
-#define EIGEN_GEBP_ONESTEP_LOADRHS(K,N) \
-          EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING(K,N)
-#endif
-
 #define EIGEN_GEBP_ONESTEP(K) \
           do { \
             EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
@@ -859,19 +1002,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
             traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \
             traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \
             traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \
-            EIGEN_GEBP_ONESTEP_LOADRHS(K, 0); \
+            possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 0>(B_0, blB); \
            traits.madd(A0, B_0, C0, T0); \
            traits.madd(A1, B_0, C4, T0); \
            traits.madd(A2, B_0, C8, B_0); \
-            EIGEN_GEBP_ONESTEP_LOADRHS(K, 1); \
+            possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 1>(B_0, blB); \
            traits.madd(A0, B_0, C1, T0); \
            traits.madd(A1, B_0, C5, T0); \
            traits.madd(A2, B_0, C9, B_0); \
-            EIGEN_GEBP_ONESTEP_LOADRHS(K, 2); \
+            possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 2>(B_0, blB); \
            traits.madd(A0, B_0, C2, T0); \
            traits.madd(A1, B_0, C6, T0); \
            traits.madd(A2, B_0, C10, B_0); \
-            EIGEN_GEBP_ONESTEP_LOADRHS(K, 3); \
+            possiblyRotatingKernelHelper.template loadOrRotateRhs<K, 3>(B_0, blB); \
            traits.madd(A0, B_0, C3 , T0); \
            traits.madd(A1, B_0, C7, T0); \
            traits.madd(A2, B_0, C11, B_0); \
@@ -904,34 +1047,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
         }
 
 #undef EIGEN_GEBP_ONESTEP
-#undef EIGEN_GEBP_ONESTEP_LOADRHS
-#undef EIGEN_GEBP_ONESTEP_LOADRHS_NONROTATING
-
-#if EIGEN_ARCH_PREFERS_ROTATING_KERNEL
-        if (UseRotatingKernel) {
-#define EIGEN_GEBP_UNROTATE_RESULT(res0, res1, res2, res3) \
-          do { \
-            PacketBlock<ResPacket> resblock; \
-            resblock.packet[0] = res0; \
-            resblock.packet[1] = res1; \
-            resblock.packet[2] = res2; \
-            resblock.packet[3] = res3; \
-            ptranspose(resblock); \
-            resblock.packet[3] = protate<1>(resblock.packet[3]); \
-            resblock.packet[2] = protate<2>(resblock.packet[2]); \
-            resblock.packet[1] = protate<3>(resblock.packet[1]); \
-            ptranspose(resblock); \
-            res0 = resblock.packet[0]; \
-            res1 = resblock.packet[1]; \
-            res2 = resblock.packet[2]; \
-            res3 = resblock.packet[3]; \
-          } while (false)
-
-          EIGEN_GEBP_UNROTATE_RESULT(C0, C1, C2, C3);
-          EIGEN_GEBP_UNROTATE_RESULT(C4, C5, C6, C7);
-          EIGEN_GEBP_UNROTATE_RESULT(C8, C9, C10, C11);
-        }
-#endif
+        possiblyRotatingKernelHelper.unrotateResult(C0, C1, C2, C3);
+        possiblyRotatingKernelHelper.unrotateResult(C4, C5, C6, C7);
+        possiblyRotatingKernelHelper.unrotateResult(C8, C9, C10, C11);
 
         ResPacket R0, R1, R2;
         ResPacket alphav = pset1<ResPacket>(alpha);
@@ -164,6 +164,8 @@ static void run(Index rows, Index cols, Index depth,
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
 
+    const bool pack_rhs_once = mc!=rows && kc==depth && nc==cols;
+
     // For each horizontal panel of the rhs, and corresponding panel of the lhs...
     for(Index i2=0; i2<rows; i2+=mc)
@@ -188,7 +190,8 @@ static void run(Index rows, Index cols, Index depth,
         // We pack the rhs's block into a sequential chunk of memory (L2 caching)
         // Note that this block will be read a very high number of times, which is equal to the number of
         // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
-        pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
+        if((!pack_rhs_once) || i2==0)
+          pack_rhs(blockB, rhs.getSubMapper(k2,j2), actual_kc, actual_nc);
 
         // Everything is packed, we can now call the panel * block kernel:
         gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
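The new pack_rhs_once flag skips re-packing the rhs when a single kc x nc block already covers the whole rhs (kc==depth and nc==cols) while the lhs is still traversed in several mc-high row panels. A tiny standalone model of that guard, with made-up sizes (the numbers are assumptions for illustration only):

    #include <cstdio>

    int main() {
      const int rows = 256, depth = 64, cols = 48;  // assumed problem sizes
      const int mc = 64, kc = 64, nc = 48;          // assumed blocking sizes

      const bool pack_rhs_once = (mc != rows) && (kc == depth) && (nc == cols);

      int rhs_packs = 0;
      for (int i2 = 0; i2 < rows; i2 += mc) {       // sweep of lhs row panels
        if (!pack_rhs_once || i2 == 0)
          ++rhs_packs;                              // pack_rhs(...) would run here
      }
      std::printf("rhs packed %d time(s)\n", rhs_packs);  // 1 instead of rows/mc = 4
      return 0;
    }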
@@ -93,8 +93,7 @@
         THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
         OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
         IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
-        STORAGE_LAYOUT_DOES_NOT_MATCH,
-        ROTATION_BY_ILLEGAL_OFFSET
+        STORAGE_LAYOUT_DOES_NOT_MATCH
       };
     };
@@ -18,8 +18,22 @@ before-evaluators
 6334:f6a45e5b8b7c
 6639:c9121c60b5c7
 6655:06f163b5221f
-6677:700e023044e7
+6677:700e023044e7 # FMA has been wrongly disabled
 6681:11d31dafb0e3
-6844:039efd86b75c
-6845:7333ed40c6ef
-6911:6192dd812d84
+6699:5e6e8e10aad1 # merge default to tensors
+6726:ff2d2388e7b9 # merge default to tensors
+6742:0cbd6195e829 # merge default to tensors
+6747:853d2bafeb8f # Generalized the gebp apis
+6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
+6781:9cc5a931b2c6 # generalized gemv
+6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product
+6844:039efd86b75c # merge tensor
+6845:7333ed40c6ef # change prefetching in gebp
+6856:b5be5e10eb7f # merge index conversion
+6893:c3a64aba7c70 # clean blocking size computation
+6898:6fb31ebe6492 # rotating kernel for ARM
+6899:877facace746 # rotating kernel for ARM only
+6904:c250623ae9fa # result_of
+6921:915f1b1fc158 # fix prefetching change for ARM
+6923:9ff25f6dacc6 # prefetching
+6933:52572e60b5d3 # blocking size strategy
@@ -1,5 +1,25 @@
 #!/bin/bash
 
+# Examples of environment variables to be set:
+# PREFIX="haswell-fma-"
+# CXX_FLAGS="-mfma"
+
+# Options:
+# -up : enforce the recomputation of existing data, and keep best results as a merging strategy
+
+
+if echo "$*" | grep '\-up' > /dev/null; then
+  update=true
+else
+  update=false
+fi
+
+if [ $update == true ]; then
+  echo "(Re-)Compute all changesets and keep bests"
+else
+  echo "Skip previously computed changesets"
+fi
+
 if [ ! -d "eigen_src" ]; then
   hg clone https://bitbucket.org/eigen/eigen eigen_src
 fi
@@ -8,9 +28,32 @@ if [ ! -z '$CXX' ]; then
   CXX=g++
 fi
 
-rm sgemm.out
-rm dgemm.out
-rm cgemm.out
+function make_backup
+{
+  if [ -f "$1.out" ]; then
+    mv "$1.out" "$1.backup"
+  fi
+}
+
+function merge
+{
+  count1=`echo $1 | wc -w`
+  count2=`echo $2 | wc -w`
+
+  if [ $count1 == $count2 ]; then
+    a=( $1 ); b=( $2 )
+    res=""
+    for (( i=0 ; i<$count1 ; i++ )); do
+      ai=${a[$i]}; bi=${b[$i]}
+      tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l`
+      res="$res $tmp"
+    done
+    echo $res
+
+  else
+    echo $1
+  fi
+}
 
 function test_current
 {
@@ -18,16 +61,32 @@ function test_current
   scalar=$2
   name=$3
 
-  if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then
-    res=`./$name`
-    echo $res
-    echo "$rev $res" >> $name.out
+  prev=`grep $rev "$name.backup" | cut -c 14-`
+  res=$prev
+  count_rev=`echo $prev | wc -w`
+  count_ref=`cat "settings.txt" | wc -l`
+  if [ $update == true ] || [ $count_rev != $count_ref ]; then
+    if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src gemm.cpp -DSCALAR=$scalar -o $name; then
+      curr=`./$name`
+      echo merge $prev
+      echo with $curr
+      res=`merge "$curr" "$prev"`
+      echo $res
+      echo "$rev $res" >> $name.out
+    else
+      echo "Compilation failed, skip rev $rev"
+    fi
   else
-    echo "Compilation failed, skip rev $rev"
+    echo "Skip existing results for $rev / $name"
+    echo "$rev $res" >> $name.out
   fi
 }
 
-while read rev
+make_backup $PREFIX"sgemm"
+make_backup $PREFIX"dgemm"
+make_backup $PREFIX"cgemm"
+
+cut -f1 -d"#" < changesets.txt | while read rev
 do
   if [ ! -z '$rev' ]; then
     echo "Testing rev $rev"
@@ -36,27 +95,27 @@ do
     actual_rev=`hg identify | cut -f1 -d' '`
     cd ..
 
-    test_current $actual_rev float sgemm
-    test_current $actual_rev double dgemm
-    test_current $actual_rev "std::complex<double>" cgemm
+    test_current $actual_rev float $PREFIX"sgemm"
+    test_current $actual_rev double $PREFIX"dgemm"
+    test_current $actual_rev "std::complex<double>" $PREFIX"cgemm"
   fi
 
-done < changesets.txt
+done
 
 echo "Float:"
-cat sgemm.out
+cat $PREFIX"sgemm.out"
 echo ""
 
 echo "Double:"
-cat dgemm.out
+cat $PREFIX"dgemm.out"
 echo ""
 
 echo "Complex:"
-cat cgemm.out
+cat $PREFIX"cgemm.out"
 echo ""
 
-./make_plot.sh sgemm
-./make_plot.sh dgemm
-./make_plot.sh cgemm
+./make_plot.sh $PREFIX"sgemm"
+./make_plot.sh $PREFIX"dgemm"
+./make_plot.sh $PREFIX"cgemm"
@@ -555,6 +555,11 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA
     chip(const Index offset, const Index dim) const {
       return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
     }
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReverseOp<const ReverseDimensions, Derived>
+    reverse(const ReverseDimensions& rev) const {
+      return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev);
+    }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorShufflingOp<const Shuffle, Derived>
     shuffle(const Shuffle& shuffle) const {
@@ -249,7 +249,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       innermostLoc = index;
     } else {
       if (internal::index_statically_eq<InputDimensions>()(0, 1)) {
-        eigen_assert(innermostLoc % m_impl.dimensions()[0] == 0);
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
         innermostLoc = 0;
       } else {
         innermostLoc = index % m_impl.dimensions()[0];
@@ -302,7 +302,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       innermostLoc = index;
     } else {
       if (internal::index_statically_eq<InputDimensions>()(NumDims-1, 1)) {
-        eigen_assert(innermostLoc % m_impl.dimensions()[NumDims-1] == 0);
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
         innermostLoc = 0;
       } else {
         innermostLoc = index % m_impl.dimensions()[NumDims-1];
@@ -174,8 +174,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
 
     OutputMapper output(buffer, m);
 
-    LhsPacker pack_lhs;
-
     // compute block sizes (which depend on number of threads)
     const Index num_threads = this->m_device.numThreads();
     Index mc = m;
@@ -190,8 +188,8 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     const Index k_blocks = CEIL_DIV(k, kc);
     const Index n_blocks = CEIL_DIV(n, nc);
     const Index m_blocks = CEIL_DIV(m, mc);
-    const int sizeA = mc * kc;
-    const int sizeB = kc * nc;
+    const Index sizeA = mc * kc;
+    const Index sizeB = kc * nc;
 
     /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
     cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
@@ -228,7 +226,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     const Index num_kernel_promises = num_threads * n_blocks;
     std::vector<Promise> kernel_promises(num_kernel_promises);
     std::vector<Future> kernel_futures(num_kernel_promises);
-    for (int i = 0; i < kernel_promises.size(); ++i) {
+    for (std::size_t i = 0; i < kernel_promises.size(); ++i) {
       kernel_promises[i].set_value();
       kernel_futures[i] = kernel_promises[i].get_future();
     }
@@ -239,16 +237,16 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       const Index actual_kc = (std::min)(k_start + kc, k) - k_start;
 
       for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
-        const int num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs);
+        const Index num_blocks = (std::min)(m_blocks-m_block_idx, numBlockAs);
 
         for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
           const Index m_start = mt_block_idx * mc;
           const Index actual_mc = (std::min)(m_start + mc, m) - m_start;
           eigen_assert(actual_mc > 0);
 
-          int blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
+          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
           for (int i = 0; i < n_blocks; ++i) {
-            int future_id = (blockAId * n_blocks + i);
+            Index future_id = (blockAId * n_blocks + i);
             wait_until_ready(&kernel_futures[future_id]);
             kernel_promises[future_id] = Promise();
             kernel_futures[future_id] = kernel_promises[future_id].get_future();
@@ -277,9 +275,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           // first make sure the previous kernels are all done before overwriting rhs. Also wait if
           // we're going to start new k. In both cases need_to_pack is true.
           if (need_to_pack) {
-            for (int i = num_blocks; i < num_threads; ++i) {
-              int blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
-              int future_id = (blockAId * n_blocks + n_block_idx);
+            for (Index i = num_blocks; i < num_threads; ++i) {
+              Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
+              Index future_id = (blockAId * n_blocks + n_block_idx);
               wait_until_ready(&kernel_futures[future_id]);
             }
           }
@@ -361,7 +359,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
         const Index m_base_start = arg.m + arg.mc*mt_block_idx;
         if (m_base_start < arg.max_m) {
-          int blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
+          Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
 
           wait_until_ready(&(*arg.lhs_futures)[blockAId]);
           const Index actual_mc = (std::min)(m_base_start + arg.mc, arg.max_m) - m_base_start;
@@ -230,7 +230,7 @@ struct DSizes : array<DenseIndex, NumDims> {
   }
 
   EIGEN_DEVICE_FUNC DSizes() {
-    for (int i = 0 ; i < NumDims; ++i) {
+    for (std::size_t i = 0 ; i < NumDims; ++i) {
       (*this)[i] = 0;
     }
   }
@@ -97,7 +97,7 @@ struct EvalRange<Evaluator, Index, true> {
 
     Index i = first;
     static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    if (last - first > PacketSize) {
+    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index lastPacket = last - (last % PacketSize);
      for (; i < lastPacket; i += PacketSize) {
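The changed comparison matters exactly when the range holds one full packet: with last - first equal to PacketSize, the old strict test skipped the vectorized path even though a whole packet fits. A minimal illustration of the two bounds, assuming PacketSize == 4:

    #include <cstdio>

    int main() {
      const int PacketSize = 4;        // assumed packet width
      const int first = 0, last = 4;   // exactly one full packet of work

      const bool old_path = (last - first) >  PacketSize;  // false: packet loop skipped
      const bool new_path = (last - first) >= PacketSize;  // true: packet loop taken

      std::printf("old: %d new: %d\n", old_path, new_path);
      return 0;
    }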
@@ -131,7 +131,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable>
     const Index blocksize = std::max<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
     const Index numblocks = size / blocksize;
 
-    Index i = 0;
     std::vector<Future> results;
     results.reserve(numblocks);
     for (int i = 0; i < numblocks; ++i) {
@@ -28,6 +28,23 @@ namespace Eigen {
 
 namespace internal {
 
+namespace {
+  // Note: result is undefined if val == 0
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val)
+  {
+#ifdef __CUDA_ARCH__
+    return __clz(val);
+#elif EIGEN_COMP_MSVC
+    DWORD leading_zero = 0;
+    _BitScanReverse( &leading_zero, value);
+    return 31 - leading_zero;
+#else
+    return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+  }
+}
+
 template <typename T>
 struct TensorIntDivisor {
  public:
@@ -44,11 +61,7 @@ struct TensorIntDivisor {
     eigen_assert(divider <= (1<<(N-1)) - 1);
 
     // fast ln2
-#ifndef __CUDA_ARCH__
-    const int leading_zeros = __builtin_clz(divider);
-#else
-    const int leading_zeros = __clz(divider);
-#endif
+    const int leading_zeros = count_leading_zeros(divider);
     const int log_div = N - (leading_zeros+1);
 
     multiplier = (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
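count_leading_zeros feeds the fast-log2 step: log_div = N - (leading_zeros + 1) is floor(log2(divider)) for 32-bit operands. A quick check of that identity using GCC/Clang's __builtin_clz (N = 32 and divider = 10 are assumptions for the example); as an aside, the MSVC branch above passes `value` although the parameter is named `val`:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int N = 32;                  // assumed operand width in bits
      const std::uint32_t divider = 10;  // example divisor; must be non-zero

      const int leading_zeros = __builtin_clz(divider);  // 28 for 10 (binary 1010)
      const int log_div = N - (leading_zeros + 1);       // 3 == floor(log2(10))

      assert(log_div == 3);
      return 0;
    }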
@@ -85,6 +85,15 @@ class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteA
     const typename internal::remove_all<typename XprType::Nested>::type&
     expression() const { return m_xpr; }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const TensorLayoutSwapOp& other)
+    {
+      typedef TensorAssignOp<TensorLayoutSwapOp, const TensorLayoutSwapOp> Assign;
+      Assign assign(*this, other);
+      internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
+      return *this;
+    }
+
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE TensorLayoutSwapOp& operator = (const OtherDerived& other)
|
@ -302,7 +302,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
|
||||
: m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
|
||||
{
|
||||
for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
for (std::size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
|
||||
eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
|
||||
}
|
||||
|
||||
|
@ -49,12 +49,9 @@ struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename ReverseDimensions, typename XprType>
|
||||
class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
|
||||
XprType>, ReadOnlyAccessors>
|
||||
XprType>, WriteAccessors>
|
||||
{
|
||||
public:
|
||||
typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
|
||||
@ -67,8 +64,8 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
|
||||
StorageKind;
|
||||
typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr,
|
||||
const ReverseDimensions& reverse_dims)
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
|
||||
const XprType& expr, const ReverseDimensions& reverse_dims)
|
||||
: m_xpr(expr), m_reverse_dims(reverse_dims) {}
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
@ -78,12 +75,30 @@ class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
|
||||
const typename internal::remove_all<typename XprType::Nested>::type&
|
||||
expression() const { return m_xpr; }
|
||||
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorReverseOp& operator = (const TensorReverseOp& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorReverseOp, const TensorReverseOp> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<typename OtherDerived>
|
||||
EIGEN_DEVICE_FUNC
|
||||
EIGEN_STRONG_INLINE TensorReverseOp& operator = (const OtherDerived& other)
|
||||
{
|
||||
typedef TensorAssignOp<TensorReverseOp, const OtherDerived> Assign;
|
||||
Assign assign(*this, other);
|
||||
internal::TensorExecutor<const Assign, DefaultDevice, false>::run(assign, DefaultDevice());
|
||||
return *this;
|
||||
}
|
||||
|
||||
protected:
|
||||
typename XprType::Nested m_xpr;
|
||||
const ReverseDimensions m_reverse_dims;
|
||||
};
|
||||
|
||||
|
||||
// Eval as rvalue
|
||||
template<typename ReverseDimensions, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
@ -134,8 +149,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
m_impl.cleanup();
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
|
||||
{
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(
|
||||
Index index) const {
|
||||
eigen_assert(index < dimensions().TotalSize());
|
||||
Index inputIndex = 0;
|
||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||
@ -152,7 +167,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
} else {
|
||||
inputIndex += index;
|
||||
}
|
||||
return m_impl.coeff(inputIndex);
|
||||
} else {
|
||||
for (int i = 0; i < NumDims - 1; ++i) {
|
||||
Index idx = index / m_strides[i];
|
||||
@ -167,8 +181,13 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
} else {
|
||||
inputIndex += index;
|
||||
}
|
||||
return m_impl.coeff(inputIndex);
|
||||
}
|
||||
return inputIndex;
|
||||
}
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(
|
||||
Index index) const {
|
||||
return m_impl.coeff(reverseIndex(index));
|
||||
}
|
||||
|
||||
template<int LoadMode>
|
||||
@ -199,9 +218,57 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
|
||||
ReverseDimensions m_reverse;
|
||||
};
|
||||
|
||||
// Eval as lvalue
|
||||
|
||||
template <typename ReverseDimensions, typename ArgType, typename Device>
|
||||
struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
|
||||
: public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
|
||||
Device> {
|
||||
typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
|
||||
Device> Base;
|
||||
typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
|
||||
typedef typename XprType::Index Index;
|
||||
static const int NumDims = internal::array_size<ReverseDimensions>::value;
|
||||
typedef DSizes<Index, NumDims> Dimensions;
|
||||
|
||||
enum {
|
||||
IsAligned = false,
|
||||
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
|
||||
Layout = TensorEvaluator<ArgType, Device>::Layout,
|
||||
CoordAccess = false, // to be implemented
|
||||
};
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
|
||||
const Device& device)
|
||||
: Base(op, device) {}
|
||||
|
||||
typedef typename XprType::Scalar Scalar;
|
||||
typedef typename XprType::CoeffReturnType CoeffReturnType;
|
||||
typedef typename XprType::PacketReturnType PacketReturnType;
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
const Dimensions& dimensions() const { return this->m_dimensions; }
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
|
||||
return this->m_impl.coeffRef(this->reverseIndex(index));
|
||||
}
|
||||
|
||||
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
void writePacket(Index index, const PacketReturnType& x) {
|
||||
const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
|
||||
EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
|
||||
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
|
||||
|
||||
// This code is pilfered from TensorMorphing.h
|
||||
EIGEN_ALIGN_DEFAULT CoeffReturnType values[packetSize];
|
||||
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
|
||||
for (int i = 0; i < packetSize; ++i) {
|
||||
this->coeffRef(index+i) = values[i];
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // end namespace Eigen
|
||||
} // end namespace Eigen
|
||||
|
||||
#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
|
||||
|
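With the new lvalue evaluator and assignment operators, a reversed expression can now sit on the left-hand side of an assignment, which is what the updated test below exercises. A minimal usage sketch (assumes the unsupported/CXX11 Tensor module is available):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 2> src(2, 3), dst(2, 3);
      src.setRandom();

      Eigen::array<bool, 2> rev;
      rev[0] = true;   // reverse along dimension 0
      rev[1] = false;

      dst.reverse(rev) = src;  // write through the reversed view (enabled by this change)
      return 0;
    }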
@@ -94,7 +94,7 @@ static void test_simple_reverse()
 
 
 template <int DataLayout>
-static void test_expr_reverse()
+static void test_expr_reverse(bool LValue)
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   tensor.setRandom();
@@ -105,9 +105,12 @@ static void test_expr_reverse()
   dim_rev[2] = false;
   dim_rev[3] = true;
 
-  Tensor<float, 4, DataLayout> expected;
-  expected = tensor.reverse(dim_rev);
+  Tensor<float, 4, DataLayout> expected(2, 3, 5, 7);
+  if (LValue) {
+    expected.reverse(dim_rev) = tensor;
+  } else {
+    expected = tensor.reverse(dim_rev);
+  }
 
   Tensor<float, 4, DataLayout> result(2,3,5,7);
 
@@ -117,8 +120,13 @@ static void test_expr_reverse()
   array<ptrdiff_t, 4> dst_slice_start{{0,0,0,0}};
 
   for (int i = 0; i < 5; ++i) {
-    result.slice(dst_slice_start, dst_slice_dim) =
-      tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    if (LValue) {
+      result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+          tensor.slice(src_slice_start, src_slice_dim);
+    } else {
+      result.slice(dst_slice_start, dst_slice_dim) =
+          tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
     src_slice_start[2] += 1;
     dst_slice_start[2] += 1;
   }
@@ -141,8 +149,13 @@ static void test_expr_reverse()
   dst_slice_start[2] = 0;
   result.setRandom();
   for (int i = 0; i < 5; ++i) {
-    result.slice(dst_slice_start, dst_slice_dim) =
-      tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    if (LValue) {
+      result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+          tensor.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      result.slice(dst_slice_start, dst_slice_dim) =
+          tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
     dst_slice_start[2] += 1;
   }
 
@@ -162,6 +175,8 @@ void test_cxx11_tensor_reverse()
 {
   CALL_SUBTEST(test_simple_reverse<ColMajor>());
   CALL_SUBTEST(test_simple_reverse<RowMajor>());
-  CALL_SUBTEST(test_expr_reverse<ColMajor>());
-  CALL_SUBTEST(test_expr_reverse<RowMajor>());
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(false));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(false));
 }