mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Fixing HIP breakage caused by the recent commit that introduces Packet4h2 as the Eigen::Half packet type
This commit is contained in:
parent
d53ae40f7b
commit
7158ed4e0e
@ -706,7 +706,7 @@ struct hash<Eigen::half> {
|
||||
|
||||
// Add the missing shfl_xor intrinsic
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
defined(EIGEN_HIPCC)
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||
#if (EIGEN_CUDA_SDK_VER < 90000) || \
|
||||
@ -720,7 +720,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
|
||||
|
||||
// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
|
||||
#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
defined(EIGEN_HIPCC)
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
|
||||
return Eigen::half_impl::raw_uint16_to_half(
|
||||
__ldg(reinterpret_cast<const unsigned short*>(ptr)));
|
||||
|
@ -481,7 +481,7 @@ ptranspose(PacketBlock<double2,2>& kernel) {
|
||||
// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
|
||||
// its corresponding packet_traits<Eigen::half> must be visible on host.
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC)) || \
|
||||
(defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
|
||||
|
||||
typedef ulonglong2 Packet4h2;
|
||||
@ -515,11 +515,13 @@ template<> struct packet_traits<Eigen::half> : default_packet_traits
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
|
||||
half2 r;
|
||||
r.x = from;
|
||||
r.y = from;
|
||||
return r;
|
||||
#elif defined(EIGEN_HIPCC)
|
||||
return __half2{from,from};
|
||||
#else
|
||||
return __half2half2(from);
|
||||
#endif
|
||||
@ -537,7 +539,7 @@ pset1<Packet4h2>(const Eigen::half& from) {
|
||||
return r;
|
||||
}
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
|
||||
namespace {
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
|
||||
@ -559,7 +561,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
|
||||
const half2& from) {
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
|
||||
to[0] = from.x;
|
||||
to[1] = from.y;
|
||||
#else
|
||||
@ -1056,7 +1058,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
|
||||
#endif
|
||||
} // namespace
|
||||
|
||||
|
||||
template <>
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
|
||||
pload<Packet4h2>(const Eigen::half* from) {
|
||||
|
@ -439,9 +439,6 @@
|
||||
#if defined(EIGEN_HIPCC)
|
||||
#define EIGEN_VECTORIZE_GPU
|
||||
#include <hip/hip_vector_types.h>
|
||||
#endif
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
#define EIGEN_HAS_HIP_FP16
|
||||
#include <hip/hip_fp16.h>
|
||||
#endif
|
||||
|
@ -306,11 +306,17 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in, wka_out;
|
||||
wka_in.h = accum;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &accum);
|
||||
PacketType r1;
|
||||
half2* hr = reinterpret_cast<half2*>(&r1);
|
||||
half2* hacc = reinterpret_cast<half2*>(&accum);
|
||||
for (int i = 0; i < packet_width / 2; i++) {
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in, wka_out;
|
||||
wka_in.h = hacc[i];
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
hr[i] = wka_out.h;
|
||||
}
|
||||
reducer.reducePacket(r1, &accum);
|
||||
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
PacketType r1;
|
||||
half2* hr = reinterpret_cast<half2*>(&r1);
|
||||
@ -661,16 +667,26 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
#pragma unroll
|
||||
for (int offset = warpSize/2; offset > 0; offset /= 2) {
|
||||
#if defined(EIGEN_HIPCC)
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in, wka_out;
|
||||
PacketType r1;
|
||||
PacketType r2;
|
||||
half2* hr1 = reinterpret_cast<half2*>(&r1);
|
||||
half2* hr2 = reinterpret_cast<half2*>(&r2);
|
||||
half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
|
||||
half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
|
||||
for (int i = 0; i < packet_width / 2; i++) {
|
||||
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
|
||||
union { int i; half2 h; } wka_in1, wka_out1;
|
||||
wka_in1.h = rv1[i];
|
||||
wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
|
||||
hr1[i] = wka_out1.h;
|
||||
|
||||
wka_in.h = reduced_val1;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &reduced_val1);
|
||||
|
||||
wka_in.h = reduced_val2;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &reduced_val2);
|
||||
union { int i; half2 h; } wka_in2, wka_out2;
|
||||
wka_in2.h = rv2[i];
|
||||
wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
|
||||
hr2[i] = wka_out2.h;
|
||||
}
|
||||
reducer.reducePacket(r1, &reduced_val1);
|
||||
reducer.reducePacket(r2, &reduced_val2);
|
||||
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
PacketType r1;
|
||||
PacketType r2;
|
||||
|
Loading…
Reference in New Issue
Block a user