Created the pblend packet primitive and implemented it using SSE and AVX instructions.

This commit is contained in:
Benoit Steiner 2014-06-06 20:18:44 -07:00
parent 79085e08e9
commit 29aebf96e6
5 changed files with 90 additions and 4 deletions

View File

@ -54,6 +54,7 @@ struct default_packet_traits
HasMax = 1,
HasConj = 1,
HasSetLinear = 1,
HasBlend = 0,
HasDiv = 0,
HasSqrt = 0,
@ -429,6 +430,19 @@ ptranspose(PacketBlock<Packet,1>& /*kernel*/) {
// Nothing to do in the scalar case, i.e. a 1x1 matrix.
}
/***************************************************************************
* Selector, i.e. vector of N boolean values used to select (i.e. blend)
* words from 2 packets.
***************************************************************************/
template <size_t N> struct Selector {
bool select[N];
};
template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
return ifPacket.select[0] ? thenPacket : elsePacket;
}
} // end namespace internal
} // end namespace Eigen

View File

@ -59,6 +59,7 @@ template<> struct packet_traits<float> : default_packet_traits
HasLog = 0,
HasExp = 0,
HasSqrt = 0
HasBlend = 1,
};
};
template<> struct packet_traits<double> : default_packet_traits
@ -73,6 +74,7 @@ template<> struct packet_traits<double> : default_packet_traits
HasDiv = 1,
HasExp = 0
HasBlend = 1,
};
};
@ -557,6 +559,19 @@ ptranspose(PacketBlock<Packet4d,4>& kernel) {
kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
}
template<> EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket, const Packet8f& elsePacket) {
const __m256 zero = _mm256_setzero_ps();
const __m256 select = _mm256_set_ps(ifPacket.select[7], ifPacket.select[6], ifPacket.select[5], ifPacket.select[4], ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256 false_mask = _mm256_cmp_ps(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_ps(thenPacket, elsePacket, false_mask);
}
template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket, const Packet4d& elsePacket) {
const __m256d zero = _mm256_setzero_pd();
const __m256d select = _mm256_set_pd(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m256d false_mask = _mm256_cmp_pd(select, zero, _CMP_EQ_UQ);
return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
}
} // end namespace internal
} // end namespace Eigen

View File

@ -44,7 +44,8 @@ template<> struct packet_traits<std::complex<float> > : default_packet_traits
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasSetLinear = 0
HasSetLinear = 0,
HasBlend = 1
};
};
#endif
@ -472,6 +473,11 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
kernel.packet[1].v = tmp;
}
template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
__m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
return Packet2cf(_mm_castpd_ps(result));
}
} // end namespace internal
} // end namespace Eigen

View File

@ -108,7 +108,8 @@ template<> struct packet_traits<float> : default_packet_traits
HasCos = EIGEN_FAST_MATH,
HasLog = 1,
HasExp = 1,
HasSqrt = 1
HasSqrt = 1,
HasBlend = 1
};
};
template<> struct packet_traits<double> : default_packet_traits
@ -123,7 +124,8 @@ template<> struct packet_traits<double> : default_packet_traits
HasDiv = 1,
HasExp = 1,
HasSqrt = 1
HasSqrt = 1,
HasBlend = 1
};
};
#endif
@ -135,7 +137,9 @@ template<> struct packet_traits<int> : default_packet_traits
// FIXME check the Has*
Vectorizable = 1,
AlignedOnScalar = 1,
size=4
size=4,
HasBlend = 1
};
};
@ -809,6 +813,37 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
}
template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
const __m128i zero = _mm_setzero_si128();
const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128i false_mask = _mm_cmpeq_epi32(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_epi8(thenPacket, elsePacket, false_mask);
#else
return _mm_or_si128(_mm_andnot_si128(false_mask, thenPacket), _mm_and_si128(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
const __m128 zero = _mm_setzero_ps();
const __m128 select = _mm_set_ps(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
__m128 false_mask = _mm_cmpeq_ps(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_ps(thenPacket, elsePacket, false_mask);
#else
return _mm_or_ps(_mm_andnot_ps(false_mask, thenPacket), _mm_and_ps(false_mask, elsePacket));
#endif
}
template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
const __m128d zero = _mm_setzero_pd();
const __m128d select = _mm_set_pd(ifPacket.select[1], ifPacket.select[0]);
__m128d false_mask = _mm_cmpeq_pd(select, zero);
#ifdef EIGEN_VECTORIZE_SSE4_1
return _mm_blendv_pd(thenPacket, elsePacket, false_mask);
#else
return _mm_or_pd(_mm_andnot_pd(false_mask, thenPacket), _mm_and_pd(false_mask, elsePacket));
#endif
}
} // end namespace internal
} // end namespace Eigen

View File

@ -261,6 +261,22 @@ template<typename Scalar> void packetmath()
VERIFY(isApproxAbs(data2[j], data1[i+j*PacketSize], refvalue) && "ptranspose");
}
}
if (internal::packet_traits<Scalar>::HasBlend) {
Packet thenPacket = internal::pload<Packet>(data1);
Packet elsePacket = internal::pload<Packet>(data2);
EIGEN_ALIGN_DEFAULT internal::Selector<PacketSize> selector;
for (int i = 0; i < PacketSize; ++i) {
selector.select[i] = i;
}
Packet blend = internal::pblend(selector, thenPacket, elsePacket);
EIGEN_ALIGN_DEFAULT Scalar result[size];
internal::pstore(result, blend);
for (int i = 0; i < PacketSize; ++i) {
VERIFY(isApproxAbs(result[i], (selector.select[i] ? data1[i] : data2[i]), refvalue));
}
}
}
template<typename Scalar> void packetmath_real()