mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-27 07:29:52 +08:00
Add print for SSE/NEON, use NEON rounding intrinsics if available.
In SSE, by adding/subtracting 2^MantissaBits, we force rounding according to the current rounding mode. For NEON, we use the provided intrinsics for rint/floor/ceil if available (armv8). Related to #1969.
This commit is contained in:
parent
976ae0ca6f
commit
1e0c7d4f49
@ -196,6 +196,7 @@ struct packet_traits<float> : default_packet_traits
|
|||||||
HasDiv = 1,
|
HasDiv = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
|
HasRint = 1,
|
||||||
|
|
||||||
HasSin = EIGEN_FAST_MATH,
|
HasSin = EIGEN_FAST_MATH,
|
||||||
HasCos = EIGEN_FAST_MATH,
|
HasCos = EIGEN_FAST_MATH,
|
||||||
@ -3182,63 +3183,88 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l
|
|||||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
|
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
|
||||||
{ return vbslq_u64(mask, a, b); }
|
{ return vbslq_u64(mask, a, b); }
|
||||||
|
|
||||||
|
// Use armv8 rounding intinsics if available.
|
||||||
|
#if EIGEN_ARCH_ARMV8
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a)
|
||||||
|
{ return vrndn_f32(a); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a)
|
||||||
|
{ return vrndnq_f32(a); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
|
template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
|
||||||
{
|
{ return vrndm_f32(a); }
|
||||||
const Packet2f cst_1 = pset1<Packet2f>(1.0f);
|
|
||||||
// Round to nearest.
|
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
||||||
Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a));
|
{ return vrndmq_f32(a); }
|
||||||
// If greater, subtract one.
|
|
||||||
Packet2ui mask = vcgt_f32(tmp, a);
|
template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
|
||||||
mask = vand_u32(mask, vreinterpret_u32_f32(cst_1));
|
{ return vrndp_f32(a); }
|
||||||
tmp = vsub_f32(tmp, vreinterpret_f32_u32(mask));
|
|
||||||
// Handle saturation cases.
|
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
||||||
const Packet2f cst_max = pset1<Packet2f>(static_cast<float>(NumTraits<int32_t>::highest()));
|
{ return vrndpq_f32(a); }
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
|
||||||
|
// Adds and subtracts signum(a) * 2^23 to force rounding.
|
||||||
|
const Packet4f offset =
|
||||||
|
pselect(pcmp_lt(a, pzero(a)),
|
||||||
|
pset1<Packet4f>(-static_cast<float>(1<<23)),
|
||||||
|
pset1<Packet4f>(+static_cast<float>(1<<23)));
|
||||||
|
return psub(padd(a, offset), offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f print(const Packet2f& a) {
|
||||||
|
// Adds and subtracts signum(a) * 2^23 to force rounding.
|
||||||
|
const Packet2f offset =
|
||||||
|
pselect(pcmp_lt(a, pzero(a)),
|
||||||
|
pset1<Packet2f>(-static_cast<float>(1<<23)),
|
||||||
|
pset1<Packet2f>(+static_cast<float>(1<<23)));
|
||||||
|
return psub(padd(a, offset), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
||||||
{
|
{
|
||||||
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
||||||
// Round to nearest.
|
Packet4f tmp = print<Packet4f>(a);
|
||||||
Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
|
|
||||||
// If greater, subtract one.
|
// If greater, subtract one.
|
||||||
Packet4ui mask = vcgtq_f32(tmp, a);
|
Packet4f mask = pcmp_lt(a, tmp);
|
||||||
mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
|
mask = pand(mask, cst_1);
|
||||||
tmp = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
|
return psub(tmp, mask);
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
|
template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
|
||||||
{
|
{
|
||||||
const Packet2f cst_1 = pset1<Packet2f>(1.0f);
|
const Packet2f cst_1 = pset1<Packet2f>(1.0f);
|
||||||
// Round to nearest.
|
Packet2f tmp = print<Packet2f>(a);
|
||||||
Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a));
|
// If greater, subtract one.
|
||||||
// If smaller, add one.
|
Packet2f mask = pcmp_lt(a, tmp);
|
||||||
Packet2ui mask = vclt_f32(tmp, a);
|
mask = pand(mask, cst_1);
|
||||||
mask = vand_u32(mask, vreinterpret_u32_f32(cst_1));
|
return psub(tmp, mask);
|
||||||
tmp = vadd_f32(tmp, vreinterpret_f32_u32(mask));
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet2f cst_max = pset1<Packet2f>(static_cast<float>(NumTraits<int32_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
||||||
{
|
{
|
||||||
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
||||||
// Round to nearest.
|
Packet4f tmp = print<Packet4f>(a);
|
||||||
Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
|
|
||||||
// If smaller, add one.
|
// If smaller, add one.
|
||||||
Packet4ui mask = vcltq_f32(tmp, a);
|
Packet4f mask = pcmp_lt(tmp, a);
|
||||||
mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
|
mask = pand(mask, cst_1);
|
||||||
tmp = vaddq_f32(tmp, vreinterpretq_f32_u32(mask));
|
return padd(tmp, mask);
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a)
|
||||||
|
{
|
||||||
|
const Packet2f cst_1 = pset1<Packet2f>(1.0);
|
||||||
|
Packet2f tmp = print<Packet2f>(a);
|
||||||
|
// If smaller, add one.
|
||||||
|
Packet2f mask = pcmp_lt(tmp, a);
|
||||||
|
mask = pand(mask, cst_1);
|
||||||
|
return padd(tmp, mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Computes the integer square root
|
* Computes the integer square root
|
||||||
* @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
|
* @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
|
||||||
@ -3404,6 +3430,7 @@ template<> struct packet_traits<bfloat16> : default_packet_traits
|
|||||||
HasDiv = 1,
|
HasDiv = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
|
HasRint = 1,
|
||||||
|
|
||||||
HasSin = EIGEN_FAST_MATH,
|
HasSin = EIGEN_FAST_MATH,
|
||||||
HasCos = EIGEN_FAST_MATH,
|
HasCos = EIGEN_FAST_MATH,
|
||||||
@ -3565,6 +3592,11 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4
|
|||||||
return pselect<Packet4us>(mask, a, b);
|
return pselect<Packet4us>(mask, a, b);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a)
|
||||||
|
{
|
||||||
|
return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
|
||||||
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
|
template<> EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a)
|
||||||
{
|
{
|
||||||
return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
|
return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
|
||||||
@ -3751,6 +3783,7 @@ template<> struct packet_traits<double> : default_packet_traits
|
|||||||
HasDiv = 1,
|
HasDiv = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
|
HasRint = 1,
|
||||||
|
|
||||||
HasSin = 0,
|
HasSin = 0,
|
||||||
HasCos = 0,
|
HasCos = 0,
|
||||||
@ -3932,33 +3965,14 @@ ptranspose(PacketBlock<Packet2d, 2>& kernel)
|
|||||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
|
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
|
||||||
{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
|
{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a)
|
||||||
|
{ return vrndnq_f64(a); }
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
||||||
{
|
{ return vrndmq_f64(a); }
|
||||||
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
|
||||||
// Round to nearest.
|
|
||||||
Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a));
|
|
||||||
// If greater, substract 1.
|
|
||||||
uint64x2_t mask = vcgtq_f64(tmp, a);
|
|
||||||
mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1));
|
|
||||||
tmp = vsubq_f64(tmp, vreinterpretq_f64_u64(mask));
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet2d cst_max = pset1<Packet2d>(static_cast<double>(NumTraits<int64_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
|
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
|
||||||
{
|
{ return vrndpq_f64(a); }
|
||||||
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
|
||||||
// Round to nearest.
|
|
||||||
Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a));
|
|
||||||
// If smaller, add one.
|
|
||||||
uint64x2_t mask = vcltq_f64(tmp, a);
|
|
||||||
mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1));
|
|
||||||
tmp = vaddq_f64(tmp, vreinterpretq_f64_u64(mask));
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet2d cst_max = pset1<Packet2d>(static_cast<double>(NumTraits<int64_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
|
template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent)
|
||||||
{ return pldexp_generic(a, exponent); }
|
{ return pldexp_generic(a, exponent); }
|
||||||
@ -4020,6 +4034,7 @@ struct packet_traits<Eigen::half> : default_packet_traits {
|
|||||||
HasDiv = 1,
|
HasDiv = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
|
HasRint = 1,
|
||||||
HasSin = 0,
|
HasSin = 0,
|
||||||
HasCos = 0,
|
HasCos = 0,
|
||||||
HasLog = 0,
|
HasLog = 0,
|
||||||
@ -4230,6 +4245,30 @@ EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, cons
|
|||||||
return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
|
return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a)
|
||||||
|
{ return vrndnq_f16(a); }
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a)
|
||||||
|
{ return vrndn_f16(a); }
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a)
|
||||||
|
{ return vrndmq_f16(a); }
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a)
|
||||||
|
{ return vrndm_f16(a); }
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a)
|
||||||
|
{ return vrndpq_f16(a); }
|
||||||
|
|
||||||
|
template <>
|
||||||
|
EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a)
|
||||||
|
{ return vrndp_f16(a); }
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
|
EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
|
||||||
return vsqrtq_f16(a);
|
return vsqrtq_f16(a);
|
||||||
@ -4464,62 +4503,6 @@ EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
|
|||||||
return vabs_f16(a);
|
return vabs_f16(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
|
|
||||||
const Packet8hf cst_1 = pset1<Packet8hf>(Eigen::half(1.0f));
|
|
||||||
// Round to nearest.
|
|
||||||
Packet8hf tmp = vcvtq_f16_s16(vcvtq_s16_f16(a));
|
|
||||||
// If greater, substract one.
|
|
||||||
uint16x8_t mask = vcgtq_f16(tmp, a);
|
|
||||||
mask = vandq_u16(mask, vreinterpretq_u16_f16(cst_1));
|
|
||||||
tmp = vsubq_f16(tmp, vreinterpretq_f16_u16(mask));
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet8hf cst_max = pset1<Packet8hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
|
|
||||||
const Packet4hf cst_1 = pset1<Packet4hf>(Eigen::half(1.0f));
|
|
||||||
// Round to nearest.
|
|
||||||
Packet4hf tmp = vcvt_f16_s16(vcvt_s16_f16(a));
|
|
||||||
// If greater, substract one.
|
|
||||||
uint16x4_t mask = vcgt_f16(tmp, a);
|
|
||||||
mask = vand_u16(mask, vreinterpret_u16_f16(cst_1));
|
|
||||||
tmp = vsub_f16(tmp, vreinterpret_f16_u16(mask));
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet4hf cst_max = pset1<Packet4hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
|
|
||||||
const Packet8hf cst_1 = pset1<Packet8hf>(Eigen::half(1.0f));
|
|
||||||
// Round to nearest.
|
|
||||||
Packet8hf tmp = vcvtq_f16_s16(vcvtq_s16_f16(a));
|
|
||||||
// If smaller, add one.
|
|
||||||
uint16x8_t mask = vcltq_f16(tmp, a);
|
|
||||||
mask = vandq_u16(mask, vreinterpretq_u16_f16(cst_1));
|
|
||||||
tmp = vaddq_f16(tmp, vreinterpretq_f16_u16(mask));
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet8hf cst_max = pset1<Packet8hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
|
|
||||||
const Packet4hf cst_1 = pset1<Packet4hf>(Eigen::half(1.0f));
|
|
||||||
// Round to nearest.
|
|
||||||
Packet4hf tmp = vcvt_f16_s16(vcvt_s16_f16(a));
|
|
||||||
// If smaller, add one.
|
|
||||||
uint16x4_t mask = vclt_f16(tmp, a);
|
|
||||||
mask = vand_u16(mask, vreinterpret_u16_f16(cst_1));
|
|
||||||
tmp = vadd_f16(tmp, vreinterpret_f16_u16(mask));
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet4hf cst_max = pset1<Packet4hf>(static_cast<Eigen::half>(NumTraits<int16_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
|
EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
|
||||||
float16x4_t a_lo, a_hi, sum;
|
float16x4_t a_lo, a_hi, sum;
|
||||||
|
@ -148,12 +148,11 @@ struct packet_traits<float> : default_packet_traits {
|
|||||||
HasErf = EIGEN_FAST_MATH,
|
HasErf = EIGEN_FAST_MATH,
|
||||||
HasBlend = 1,
|
HasBlend = 1,
|
||||||
HasCeil = 1,
|
HasCeil = 1,
|
||||||
HasFloor = 1
|
HasFloor = 1,
|
||||||
|
HasRint = 1,
|
||||||
|
|
||||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||||
,
|
HasRound = 1,
|
||||||
HasRint = 1,
|
|
||||||
HasRound = 1
|
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@ -175,12 +174,10 @@ struct packet_traits<double> : default_packet_traits {
|
|||||||
HasRsqrt = 1,
|
HasRsqrt = 1,
|
||||||
HasBlend = 1,
|
HasBlend = 1,
|
||||||
HasFloor = 1,
|
HasFloor = 1,
|
||||||
HasCeil = 1
|
HasCeil = 1,
|
||||||
|
HasRint = 1,
|
||||||
#ifdef EIGEN_VECTORIZE_SSE4_1
|
#ifdef EIGEN_VECTORIZE_SSE4_1
|
||||||
,
|
|
||||||
HasRound = 1,
|
HasRound = 1,
|
||||||
HasRint = 1
|
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
@ -647,23 +644,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { ret
|
|||||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
|
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
|
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
|
||||||
#else
|
#else
|
||||||
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) {
|
||||||
{
|
// Adds and subtracts signum(a) * 2^23 to force rounding.
|
||||||
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
const Packet4f offset =
|
||||||
Packet4i emm0 = _mm_cvttps_epi32(a);
|
pselect(pcmp_lt(a, pzero(a)),
|
||||||
Packet4f tmp = _mm_cvtepi32_ps(emm0);
|
pset1<Packet4f>(-static_cast<float>(1<<23)),
|
||||||
// If greater, subtract one.
|
pset1<Packet4f>(+static_cast<float>(1<<23)));
|
||||||
Packet4f mask = _mm_cmpgt_ps(tmp, a);
|
return psub(padd(a, offset), offset);
|
||||||
mask = pand(mask, cst_1);
|
|
||||||
tmp = psub(tmp, mask);
|
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rounds to nearest integer.
|
template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) {
|
||||||
EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {
|
// Adds and subtracts signum(a) * 2^52 to force rounding.
|
||||||
// Adds and subtracts signum(a) * 2^52 to force rounding to within precision.
|
|
||||||
const Packet2d offset =
|
const Packet2d offset =
|
||||||
pselect(pcmp_lt(a, pzero(a)),
|
pselect(pcmp_lt(a, pzero(a)),
|
||||||
pset1<Packet2d>(-static_cast<double>(1ull<<52)),
|
pset1<Packet2d>(-static_cast<double>(1ull<<52)),
|
||||||
@ -671,10 +662,20 @@ EIGEN_STRONG_INLINE Packet2d pround_to_nearest(const Packet2d& a) {
|
|||||||
return psub(padd(a, offset), offset);
|
return psub(padd(a, offset), offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
|
||||||
|
{
|
||||||
|
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
||||||
|
Packet4f tmp = print<Packet4f>(a);
|
||||||
|
// If greater, subtract one.
|
||||||
|
Packet4f mask = _mm_cmpgt_ps(tmp, a);
|
||||||
|
mask = pand(mask, cst_1);
|
||||||
|
return psub(tmp, mask);
|
||||||
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
||||||
{
|
{
|
||||||
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
||||||
Packet2d tmp = pround_to_nearest(a);
|
Packet2d tmp = print<Packet2d>(a);
|
||||||
// If greater, subtract one.
|
// If greater, subtract one.
|
||||||
Packet2d mask = _mm_cmpgt_pd(tmp, a);
|
Packet2d mask = _mm_cmpgt_pd(tmp, a);
|
||||||
mask = pand(mask, cst_1);
|
mask = pand(mask, cst_1);
|
||||||
@ -684,21 +685,17 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
|
|||||||
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a)
|
||||||
{
|
{
|
||||||
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
const Packet4f cst_1 = pset1<Packet4f>(1.0f);
|
||||||
Packet4i emm0 = _mm_cvttps_epi32(a);
|
Packet4f tmp = print<Packet4f>(a);
|
||||||
Packet4f tmp = _mm_cvtepi32_ps(emm0);
|
|
||||||
// If smaller, add one.
|
// If smaller, add one.
|
||||||
Packet4f mask = _mm_cmplt_ps(tmp, a);
|
Packet4f mask = _mm_cmplt_ps(tmp, a);
|
||||||
mask = pand(mask, cst_1);
|
mask = pand(mask, cst_1);
|
||||||
tmp = padd(tmp, mask);
|
return padd(tmp, mask);
|
||||||
// Handle saturation cases.
|
|
||||||
const Packet4f cst_max = pset1<Packet4f>(static_cast<float>(NumTraits<int32_t>::highest()));
|
|
||||||
return pselect(pcmp_lt(pabs(a), cst_max), tmp, a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
|
template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a)
|
||||||
{
|
{
|
||||||
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
const Packet2d cst_1 = pset1<Packet2d>(1.0);
|
||||||
Packet2d tmp = pround_to_nearest(a);
|
Packet2d tmp = print<Packet2d>(a);
|
||||||
// If smaller, add one.
|
// If smaller, add one.
|
||||||
Packet2d mask = _mm_cmplt_pd(tmp, a);
|
Packet2d mask = _mm_cmplt_pd(tmp, a);
|
||||||
mask = pand(mask, cst_1);
|
mask = pand(mask, cst_1);
|
||||||
|
@ -265,6 +265,14 @@
|
|||||||
#define EIGEN_ARCH_ARM_OR_ARM64 0
|
#define EIGEN_ARCH_ARM_OR_ARM64 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater.
|
||||||
|
#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8
|
||||||
|
#define EIGEN_ARCH_ARMV8 1
|
||||||
|
#else
|
||||||
|
#define EIGEN_ARCH_ARMV8 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE
|
/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE
|
||||||
/// compliant Arm fp16 type
|
/// compliant Arm fp16 type
|
||||||
#if EIGEN_ARCH_ARM64
|
#if EIGEN_ARCH_ARM64
|
||||||
|
@ -577,6 +577,9 @@ void packetmath_real() {
|
|||||||
values.push_back(Scalar(-1.5) + val); // Bug 1785.
|
values.push_back(Scalar(-1.5) + val); // Bug 1785.
|
||||||
val = val / Scalar(2);
|
val = val / Scalar(2);
|
||||||
}
|
}
|
||||||
|
values.push_back(NumTraits<Scalar>::infinity());
|
||||||
|
values.push_back(-NumTraits<Scalar>::infinity());
|
||||||
|
values.push_back(NumTraits<Scalar>::quiet_NaN());
|
||||||
|
|
||||||
for (size_t k=0; k<values.size(); ++k) {
|
for (size_t k=0; k<values.size(); ++k) {
|
||||||
data1[0] = values[k];
|
data1[0] = values[k];
|
||||||
|
Loading…
Reference in New Issue
Block a user