working 64-bit support in PacketMath.h, Complex.h needed
commit b508619392
parent 0f65f2762d
@@ -20,10 +20,18 @@ namespace internal {
#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
#endif

#ifndef EIGEN_HAS_FUSED_MADD
#define EIGEN_HAS_FUSED_MADD 1
#endif

#ifndef EIGEN_HAS_FUSE_CJMADD
#define EIGEN_HAS_FUSE_CJMADD 1
#endif

// FIXME NEON has 16 quad registers, but since the current register allocator
// is so bad, it is much better to reduce it to 8
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
#endif

typedef float32x4_t Packet4f;
@@ -71,6 +79,7 @@ template<> struct packet_traits<float> : default_packet_traits
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 4,
    HasHalfPacket=0,

    HasDiv = 1,
    // FIXME check the Has*
@@ -136,6 +145,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const

template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
{
#ifndef __aarch64__
  Packet4f inv, restep, div;

  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
@@ -154,7 +164,11 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
  div = vmulq_f32(a, inv);

  return div;
#else
  return vdivq_f32(a,b);
#endif
}
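
// Sketch (illustrative, not part of the diff): the reciprocal path above is
// Newton-Raphson iteration x_{n+1} = x_n * (2 - b * x_n): vrecpeq_f32 supplies
// the initial estimate of 1/b and each vrecpsq_f32 call computes the
// (2 - b * x_n) correction term. A scalar analogue, assuming a coarse initial
// estimate x0 of 1/b:
static inline float recip_refine(float b, float x0)
{
  float x1 = x0 * (2.0f - b * x0); // first refinement step
  float x2 = x1 * (2.0f - b * x1); // second step reaches roughly float precision
  return x2;                       // a/b is then approximated as a * x2
}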

template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
{ eigen_assert(false && "packet integer division is not supported by NEON");
  return pset1<Packet4i>(0);
@@ -472,6 +486,164 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
  kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
}

//---------- double ----------
#ifdef __aarch64__

typedef float64x2_t Packet2d;

template<> struct packet_traits<double> : default_packet_traits
{
  typedef Packet2d type;
  typedef Packet2d half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 2,
    HasHalfPacket=0,

    HasDiv = 1,
    // FIXME check the Has*
    HasSin = 0,
    HasCos = 0,
    HasLog = 0,
    HasExp = 0,
    HasSqrt = 0
  };
};

template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; typedef Packet2d half; };
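
// Sketch (illustrative, not part of Eigen): generic code reaches Packet2d
// through these traits instead of naming NEON types directly. Assuming the
// generic packet primitives (ploadu, predux, ...) are in scope, a horizontal
// sum of the first packet of an array could be written as:
template<typename Scalar>
EIGEN_STRONG_INLINE Scalar sum_first_packet(const Scalar* data)
{
  typedef typename packet_traits<Scalar>::type P; // Packet2d when Scalar == double
  P p = ploadu<P>(data);  // loads unpacket_traits<P>::size scalars
  return predux<P>(p);    // horizontal add across the lanes
}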

template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return vdupq_n_f64(from); }

template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a)
{
  Packet2d countdown = EIGEN_INIT_NEON_PACKET2(0, 1);
  return vaddq_f64(pset1<Packet2d>(a), countdown);
}
template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return vnegq_f64(a); }

template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }

template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmulq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }

// for some weird reasons, it has to be overloaded for packets of integers
template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }

template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }

// Logical operations are not supported on double, so we reinterpret the
// packets as 64-bit integer vectors using NEON intrinsics
template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}

template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
{
  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
}
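
// Sketch (illustrative, not part of Eigen): these bitwise helpers allow
// branch-free sign tricks on doubles, e.g. negating both lanes by XOR-ing
// the IEEE-754 sign bits (the pnegate above uses vnegq_f64 instead):
static inline Packet2d pnegate_via_xor(const Packet2d& a)
{
  const Packet2d signmask = vreinterpretq_f64_u64(vdupq_n_u64(0x8000000000000000ULL));
  return pxor(a, signmask); // flip the sign bit of each lane
}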

template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }

template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }

template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from)
{
  return vld1q_dup_f64(from);
}
template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }

template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }

template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, DenseIndex stride)
{
  Packet2d res;
  res = vsetq_lane_f64(from[0*stride], res, 0);
  res = vsetq_lane_f64(from[1*stride], res, 1);
  return res;
}
template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, DenseIndex stride)
{
  to[stride*0] = vgetq_lane_f64(from, 0);
  to[stride*1] = vgetq_lane_f64(from, 1);
}
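
// Illustration (not part of the diff): pgather/pscatter access lanes at a
// fixed stride, e.g. reading and writing the columns of a row-major 2x2 matrix:
static inline void gather_scatter_demo()
{
  double m[4] = { 1, 2, 3, 4 };                    // rows { 1, 2 } and { 3, 4 }
  Packet2d col0 = pgather<double, Packet2d>(m, 2); // stride 2 -> { 1, 3 }
  pscatter<double, Packet2d>(m + 1, col0, 2);      // column 1 now also holds { 1, 3 }
}
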
template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }

// FIXME only store the first 2 elements?
template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }

template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }

template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }

template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }

template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
{
  float64x2_t trn1, trn2;

  // NEON zip performs interleaving of the supplied vectors.
  // We perform two interleaves in a row to acquire the transposed vector
  trn1 = vzip1q_f64(vecs[0], vecs[1]);
  trn2 = vzip2q_f64(vecs[0], vecs[1]);

  // Do the addition of the resulting vectors
  return vaddq_f64(trn1, trn2);
}
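
// Illustration (not part of the diff): with vecs[0] = { a0, a1 } and
// vecs[1] = { b0, b1 }, vzip1q_f64 yields { a0, b0 } and vzip2q_f64 yields
// { a1, b1 }, so their sum places the horizontal sum of vecs[i] in lane i:
static inline void preduxp_demo()
{
  Packet2d v0 = vsetq_lane_f64(2.0, vdupq_n_f64(1.0), 1); // { 1, 2 }
  Packet2d v1 = vsetq_lane_f64(4.0, vdupq_n_f64(3.0), 1); // { 3, 4 }
  Packet2d vecs[2] = { v0, v1 };
  Packet2d sums = preduxp<Packet2d>(vecs); // { 1+2, 3+4 } == { 3, 7 }
  (void)sums;
}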
// Other reduction functions:
// mul
template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }

// min
template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }

// max
template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }

// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
#define PALIGN_NEON(Offset,Type,Command) \
template<>\
struct palign_impl<Offset,Type>\
{\
  EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
  {\
    if (Offset!=0)\
      first = Command(first, second, Offset);\
  }\
};\

PALIGN_NEON(0,Packet2d,vextq_f64)
PALIGN_NEON(1,Packet2d,vextq_f64)
#undef PALIGN_NEON

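// Illustration (not part of the diff): palign_impl shifts a two-packet window
// left by Offset lanes, e.g. with first = { f0, f1 } and second = { s0, s1 }:
static inline Packet2d palign_demo(Packet2d first, const Packet2d& second)
{
  palign_impl<1, Packet2d>::run(first, second); // first becomes { f1, s0 }
  return first;                                 // Offset 0 leaves first unchanged
}
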
EIGEN_DEVICE_FUNC inline void
ptranspose(PacketBlock<Packet2d,2>& kernel) {
  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);

  kernel.packet[0] = trn1;
  kernel.packet[1] = trn2;
}
#endif // __aarch64__

} // end namespace internal

} // end namespace Eigen