Clean up #ifs in GPU PacketPath.

Removed redundant checks and redundant code for CUDA/HIP.

Note: there are several issues here of calling `__device__` functions
from `__host__ __device__` functions, in particular `__low2half`.
We do not address that here -- only modifying this file enough
to get our current tests to compile.

Fixed: #1847
This commit is contained in:
Antonio Sanchez 2020-12-04 15:33:19 -08:00
parent f9fac1d5b0
commit 5ec4907434

View File

@ -480,9 +480,7 @@ ptranspose(PacketBlock<double2,2>& kernel) {
// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
// its corresponding packet_traits<Eigen::half> must be visible on host.
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)) || \
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC)) || \
(defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
typedef ulonglong2 Packet4h2;
template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
@ -515,13 +513,8 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
half2 r;
r.x = from;
r.y = from;
return r;
#elif defined(EIGEN_HIPCC)
return __half2{from,from};
#if defined(EIGEN_HIPCC)
return half2half2(from);
#else
return __half2half2(from);
#endif
@ -561,50 +554,33 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
const half2& from) {
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
to[0] = from.x;
to[1] = from.y;
#else
#if defined(EIGEN_GPU_COMPILE_PHASE)
to[0] = __low2half(from);
to[1] = __high2half(from);
#else
// Unfortunately __low2half and __high2half are only __device__ functions.
to[0] = __float2half(__low2float(from));
to[1] = __float2half(__high2float(from));
#endif
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
const Eigen::half* from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 350
return __ldg((const half2*)from);
#else
return __halves2half2(*(from+0), *(from+1));
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
const Eigen::half* from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 350
return __halves2half2(__ldg(from+0), __ldg(from+1));
#else
return __halves2half2(*(from+0), *(from+1));
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
@ -651,20 +627,12 @@ ptranspose(PacketBlock<half2,2>& kernel) {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __halves2half2(a, __hadd(a, __float2half(1.0f)));
#else
float f = __half2float(a) + 1.0f;
return __halves2half2(a, __float2half(f));
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
@ -749,13 +717,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@ -766,19 +728,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hsub2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hsub2(a, b);
#else
float a1 = __low2float(a);
@ -789,39 +743,23 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
float r2 = a2 - b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hneg2(a);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hneg2(a);
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return __floats2half2_rn(-a1, -a2);
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@ -832,20 +770,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
const half2& b,
const half2& c) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hfma2(a, b, c);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hfma2(a, b, c);
#else
float a1 = __low2float(a);
@ -858,18 +788,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
float r2 = a2 * b2 + c2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __h2div(a, b);
#else // EIGEN_CUDA_ARCH
float a1 = __low2float(a);
float a2 = __high2float(a);
float b1 = __low2float(b);
@ -877,7 +802,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
float r1 = a1 / b1;
float r2 = a2 / b2;
return __floats2half2_rn(r1, r2);
#endif
}
@ -904,33 +828,17 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd(__low2half(a), __high2half(a));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hadd(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 + a2));
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hgt(first, second) ? first : second;
@ -939,20 +847,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
float a2 = __high2float(a);
return a1 > a2 ? __low2half(a) : __high2half(a);
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
__half first = __low2half(a);
__half second = __high2half(a);
return __hlt(first, second) ? first : second;
@ -961,26 +859,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
float a2 = __high2float(a);
return a1 < a2 ? __low2half(a) : __high2half(a);
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul(__low2half(a), __high2half(a));
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hmul(__low2half(a), __high2half(a));
#else
float a1 = __low2float(a);
float a2 = __high2float(a);
return Eigen::half(__float2half(a1 * a2));
#endif
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
@ -1108,14 +996,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
Packet4h2 r;
r = __ldg((const Packet4h2*)from);
return r;
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 350
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 350
Packet4h2 r;
r = __ldg((const Packet4h2*)from);
return r;
@ -1128,8 +1009,6 @@ ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
r_alias[3] = ploadt_ro_aligned(from + 6);
return r;
#endif
#endif
}
template <>
@ -1294,7 +1173,7 @@ ptranspose(PacketBlock<Packet4h2,8>& kernel) {
ptranspose_half(f_row0[1], f_row1[1]);
ptranspose_half(f_row2[0], f_row3[0]);
ptranspose_half(f_row2[1], f_row3[1]);
}
template <>
@ -1685,13 +1564,7 @@ prsqrt<Packet4h2>(const Packet4h2& a) {
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hadd2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hadd2(a, b);
#else
float a1 = __low2float(a);
@ -1702,20 +1575,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
float r2 = a2 + b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
template<>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
const half2& b) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return __hmul2(a, b);
#else // EIGEN_CUDA_ARCH
#if EIGEN_CUDA_ARCH >= 530
#if defined(EIGEN_HIP_DEVICE_COMPILE) || EIGEN_CUDA_ARCH >= 530
return __hmul2(a, b);
#else
float a1 = __low2float(a);
@ -1726,8 +1591,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
float r2 = a2 * b2;
return __floats2half2_rn(r1, r2);
#endif
#endif
}
template<>