More EIGEN_DEVICE_FUNC fixes for CUDA 10/11/12.

Antonio Sánchez 2023-02-03 19:18:45 +00:00 committed by Rasmus Munk Larsen
parent 2a90653395
commit f6cc359e10
9 changed files with 68 additions and 53 deletions
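For context: EIGEN_DEVICE_FUNC is Eigen's annotation for functions that must be callable from both host and device code; under a CUDA or HIP compiler it expands to __host__ __device__ and is empty otherwise. The sketch below shows only that general pattern, not Eigen's actual macro definitions; MY_DEVICE_FUNC, axpy and axpy_kernel are invented names.

#include <cstdio>

#if defined(__CUDACC__) || defined(__HIPCC__)
#define MY_DEVICE_FUNC __host__ __device__  // GPU build: callable on both sides
#else
#define MY_DEVICE_FUNC                      // plain host build: no annotation
#endif

// A helper usable from ordinary host code and from inside kernels.
MY_DEVICE_FUNC inline float axpy(float a, float x, float y) {
  return a * x + y;
}

#if defined(__CUDACC__)
__global__ void axpy_kernel(const float* x, float* y, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = axpy(a, x[i], y[i]);  // device-side call site
}
#endif

int main() {
  std::printf("%f\n", axpy(2.0f, 3.0f, 1.0f));  // host-side call site
  return 0;
}

When such an annotation is missing on a function reached from device code, nvcc reports that a __host__ function is being called from a __host__ __device__ function; several hunks below add exactly this kind of annotation, and the rest clean up casts and signedness that newer toolkits appear to diagnose more strictly.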

View File

@ -379,7 +379,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const float4*)from);
return __ldg(reinterpret_cast<const float4*>(from));
#else
return make_float4(from[0], from[1], from[2], from[3]);
#endif
@ -387,7 +387,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const double2*)from);
return __ldg(reinterpret_cast<const double2*>(from));
#else
return make_double2(from[0], from[1]);
#endif
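The two hunks above change only the cast style. __ldg is the CUDA intrinsic that loads through the read-only data cache and requires compute capability 3.5 or newer, which is roughly what the EIGEN_GPU_HAS_LDG guard encodes. Below is a standalone sketch of the same aligned-load-with-fallback idea; HAS_LDG, load_aligned_float4 and scale_kernel are illustrative names, not Eigen's.

#include <cuda_runtime.h>

// __ldg needs sm_35 or newer; mirror that with an architecture check.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#define HAS_LDG 1
#endif

__device__ float4 load_aligned_float4(const float* from) {
#if defined(HAS_LDG)
  // Read-only cached load; 'from' must be 16-byte aligned.
  return __ldg(reinterpret_cast<const float4*>(from));
#else
  // Fallback: four scalar loads.
  return make_float4(from[0], from[1], from[2], from[3]);
#endif
}

__global__ void scale_kernel(const float* in, float* out, int n4) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n4) {
    float4 v = load_aligned_float4(in + 4 * i);
    out[4 * i + 0] = 2.0f * v.x;
    out[4 * i + 1] = 2.0f * v.y;
    out[4 * i + 2] = 2.0f * v.z;
    out[4 * i + 3] = 2.0f * v.w;
  }
}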

View File

@ -121,6 +121,7 @@
// The __device__ annotation seems to actually be needed in some cases,
// otherwise resulting in kernel runtime errors.
EIGEN_NV_DIAG_SUPPRESS(2886)
EIGEN_NV_DIAG_SUPPRESS(2929)
EIGEN_NV_DIAG_SUPPRESS(2977)
EIGEN_NV_DIAG_SUPPRESS(20012)
#undef EIGEN_NV_DIAG_SUPPRESS
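EIGEN_NV_DIAG_SUPPRESS wraps nvcc's diagnostic-suppression pragma; this hunk simply adds warning number 2929 to the suppressed list. For readers unfamiliar with the mechanism, here is the general shape of such a macro. The MY_* names are invented, and the CUDA 11.5 cut-over for the nv_diag_suppress spelling is stated as an assumption rather than Eigen's exact logic.

#define MY_STRINGIZE(x) #x

#if defined(__NVCC__)
#  if defined(__CUDACC_VER_MAJOR__) && \
      (__CUDACC_VER_MAJOR__ > 11 || \
       (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 5))
     // Newer nvcc (roughly CUDA 11.5 on) spells the pragma nv_diag_suppress.
#    define MY_NV_DIAG_SUPPRESS(warn) _Pragma(MY_STRINGIZE(nv_diag_suppress warn))
#  else
     // Older nvcc uses the unprefixed diag_suppress form.
#    define MY_NV_DIAG_SUPPRESS(warn) _Pragma(MY_STRINGIZE(diag_suppress warn))
#  endif
#else
#  define MY_NV_DIAG_SUPPRESS(warn)  // no-op for other compilers
#endif

// Usage: silence one nvcc diagnostic number for this translation unit.
MY_NV_DIAG_SUPPRESS(2929)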

View File

@ -456,11 +456,10 @@ EIGEN_DECLARE_TEST(gpu_basic)
// numeric_limits
CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
#if defined(__NVCC__)
// FIXME
// These subtests compile only with nvcc and fail with HIPCC and clang-cuda
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
typedef Matrix<float,6,6> Matrix6f;
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
#endif
// These tests require dynamic-sized matrix multiplication, which isn't currently
// supported on GPU.
// CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
// typedef Matrix<float,6,6> Matrix6f;
// CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
}
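The replacement comment states why the eigenvalue subtests are now commented out: they end up needing dynamic-sized matrix products, and Eigen's in-kernel support is limited to fixed-size objects whose storage lives on the stack. A small illustration of that distinction, with invented function names that are not part of the test:

#include <Eigen/Dense>

// Fixed-size matrices keep all storage on the stack, so a helper like this can
// carry a device annotation and be called from a kernel.
EIGEN_DEVICE_FUNC inline float trace_of_product(const Eigen::Matrix3f& a,
                                                const Eigen::Matrix3f& b) {
  return (a * b).trace();
}

// Dynamic-size matrices allocate from the heap; products of these are what the
// disabled subtests would need, so this variant stays host-only.
inline float trace_of_product_dyn(const Eigen::MatrixXf& a,
                                  const Eigen::MatrixXf& b) {
  return (a * b).trace();
}

int main() {
  Eigen::Matrix3f id = Eigen::Matrix3f::Identity();
  return trace_of_product(id, id) == 3.0f ? 0 : 1;
}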

View File

@ -722,26 +722,26 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
#endif
// Load inputs to shared memory
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + kernelSizeX;
const size_t first_x = blockIdx.x * maxX;
const size_t last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const size_t num_x_input = last_x - first_x + kernelSizeX;
const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + kernelSizeY;
const size_t first_y = blockIdx.y * maxY;
const size_t last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const size_t num_y_input = last_y - first_y + kernelSizeY;
const int first_z = blockIdx.z * maxZ;
const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const int num_z_input = last_z - first_z + kernelSizeZ;
const size_t first_z = blockIdx.z * maxZ;
const size_t last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const size_t num_z_input = last_z - first_z + kernelSizeZ;
for (int p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
for (size_t k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (size_t j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (size_t i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
}
@ -751,18 +751,18 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
__syncthreads();
// Convolution
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
const size_t num_z_output = last_z - first_z + 1;
const size_t num_y_output = last_y - first_y + 1;
const size_t num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
for (size_t k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (size_t j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (size_t i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f;
for (int n = 0; n < kernelSizeZ; ++n) {
for (int m = 0; m < kernelSizeY; ++m) {
for (int l = 0; l < kernelSizeX; ++l) {
for (size_t n = 0; n < kernelSizeZ; ++n) {
for (size_t m = 0; m < kernelSizeY; ++m) {
for (size_t l = 0; l < kernelSizeX; ++l) {
result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
}
}
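This hunk switches the tile extents and loop counters from int to size_t, presumably to keep the comparisons and arithmetic against the unsigned CUDA built-ins (threadIdx.* and blockDim.* are unsigned int) in a single signedness. A reduced sketch of the same strided shared-memory loop shape; the kernel name and buffers are made up for illustration:

#include <cuda_runtime.h>
#include <cstddef>

__global__ void tile_copy(const float* in, float* out, size_t num_x, size_t num_y) {
  extern __shared__ float tile[];

  // size_t counters compare against the unsigned thread/block built-ins
  // without sign-conversion diagnostics.
  for (size_t j = threadIdx.y; j < num_y; j += blockDim.y) {
    for (size_t i = threadIdx.x; i < num_x; i += blockDim.x) {
      tile[i + num_x * j] = in[i + num_x * j];
    }
  }
  __syncthreads();

  for (size_t j = threadIdx.y; j < num_y; j += blockDim.y) {
    for (size_t i = threadIdx.x; i < num_x; i += blockDim.x) {
      out[i + num_x * j] = 2.0f * tile[i + num_x * j];
    }
  }
}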

View File

@ -378,7 +378,7 @@ struct GpuDevice {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
}
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
}
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
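cudaDeviceProp::sharedMemPerBlock is a size_t, so returning it from an int accessor is a narrowing conversion; the static_cast makes that narrowing explicit instead of implicit. A standalone version of the same query, with error handling reduced to a bare status check and the function name invented here:

#include <cuda_runtime.h>
#include <cstdio>

// Query the per-block shared memory size and return it as int.
int shared_mem_per_block(int device_id) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) return -1;
  // prop.sharedMemPerBlock is a size_t; the cast documents the narrowing.
  return static_cast<int>(prop.sharedMemPerBlock);
}

int main() {
  std::printf("shared mem per block: %d bytes\n", shared_mem_per_block(0));
  return 0;
}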

View File

@ -65,7 +65,8 @@ struct TensorEvaluator
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
m_dims(m.dimensions()),
m_device(device)
@ -263,7 +264,8 @@ struct TensorEvaluator<const Derived, Device>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
@ -358,6 +360,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
{
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
{ }
@ -455,6 +458,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
@ -571,6 +575,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
@ -709,6 +714,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()),
m_arg1Impl(op.arg1Expression(), device),
@ -829,6 +835,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_condImpl(op.ifExpression(), device),
m_thenImpl(op.thenExpression(), device),
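The pattern throughout this file is to add EIGEN_DEVICE_FUNC to the evaluator constructors so they can be instantiated from device code as well as host code; without it, nvcc raises the "calling a __host__ function from a __host__ __device__ function" family of diagnostics at the device-side call sites. A reduced sketch of the idea, using an invented evaluator-like class rather than Eigen's TensorEvaluator:

#include <Eigen/Core>

// Because the constructor and coeff() are marked EIGEN_DEVICE_FUNC, a
// __device__ caller may construct and use this wrapper.
template <typename Scalar>
class ScaledView {
 public:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  ScaledView(const Scalar* data, Scalar scale) : m_data(data), m_scale(scale) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Scalar coeff(int i) const { return m_scale * m_data[i]; }

 private:
  const Scalar* m_data;
  Scalar m_scale;
};

#if defined(EIGEN_GPUCC)
// Device-side construction compiles cleanly only because of the annotations.
__global__ void scale_kernel(const float* in, float* out, int n) {
  ScaledView<float> view(in, 2.0f);
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = view.coeff(i);
}
#endif

int main() {
  const float data[3] = {1.0f, 2.0f, 3.0f};
  ScaledView<float> view(data, 2.0f);
  return view.coeff(2) == 6.0f ? 0 : 1;
}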

View File

@ -94,9 +94,8 @@ class TensorExecutor {
"You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
"EIGEN_USE_SYCL before including Eigen headers.");
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const Device& device = Device()) {
const Device& device = DefaultDevice()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
@ -126,7 +125,6 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
public:
typedef typename Expression::Index StorageIndex;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(
const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
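Here the annotation moves in the opposite direction: these run() overloads build an evaluator and drive the assignment loop on the host, so EIGEN_DEVICE_FUNC is removed rather than propagated, presumably because annotating a host-only driver only produces spurious host-call diagnostics. A sketch of that rule of thumb, with a made-up host-only helper and functor:

#include <cstddef>
#include <vector>

#if defined(__CUDACC__)
#define MAYBE_DEVICE __host__ __device__
#else
#define MAYBE_DEVICE
#endif

// Host-only driver: it uses host facilities (std::vector, heap allocation),
// so it deliberately carries no device annotation.
template <typename F>
void run_on_host(F f, int n) {
  std::vector<float> buffer(static_cast<std::size_t>(n));
  for (int i = 0; i < n; ++i) buffer[static_cast<std::size_t>(i)] = f(i);
}

// The functor itself can still be device-capable; the annotation belongs on
// the pieces that genuinely run on both sides, not on the host driver.
struct Square {
  MAYBE_DEVICE float operator()(int i) const { return static_cast<float>(i * i); }
};

int main() {
  run_on_host(Square{}, 16);
  return 0;
}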

View File

@ -52,7 +52,7 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
return;
}
unsigned long long readback;
while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
@ -65,6 +65,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
gpu_assert(0 && "Wordsize not supported");
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
EIGEN_UNUSED_VARIABLE(reducer);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
@ -118,6 +121,8 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
@ -209,6 +214,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
#endif
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs);
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(semaphore);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
@ -243,7 +253,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo
template <typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, Index num_coeffs, half* output) {
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
const Index num_threads = blockDim.x * gridDim.x;
typedef typename packet_traits<Eigen::half>::type PacketType;
@ -715,11 +725,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
hr1[i] =
__shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
hr2[i] =
__shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
for (int j = 0; j < packet_width / 2; j++) {
hr1[j] =
__shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
hr2[j] =
__shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
@ -744,7 +754,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
val = __halves2half2(val1, val2);
if ((threadIdx.x & (warpSize - 1)) == 0) {
half* loc = output + row;
atomicReduce((half2*)loc, val, reducer);
atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
}
}
}
@ -782,12 +792,12 @@ struct InnerReductionLauncher<
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be a race condition between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumGpuMultiProcessors() *
const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
const int max_blocks2 = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_blocks2, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
@ -950,12 +960,12 @@ struct OuterReducer<Self, Op, GpuDevice> {
if (num_blocks > 1) {
// We initialize the outputs in the reduction kernel itself when we don't have to worry
// about race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumGpuMultiProcessors() *
const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
const int max_blocks2 = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_blocks2, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
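Most of this file's churn is mechanical: reinterpret_cast instead of C-style casts, EIGEN_UNUSED_VARIABLE on the dead fallback branches, and renames such as dyn_blocks2 which avoid shadowing the identically named variables in the enclosing scope. The CAS loop itself is worth spelling out, though, since it is the standard way to build an arbitrary atomic reduction from a 64-bit atomicCAS; below is a self-contained sketch specialised to a double-precision max, with the function name invented here:

#include <cuda_runtime.h>

// Atomic max for double built from a 64-bit compare-and-swap, mirroring the
// shape of the atomicReduce CAS loop above.
__device__ void atomic_max_double(double* output, double value) {
  // Reinterpret the slot as a 64-bit integer so atomicCAS can operate on it.
  unsigned long long* address = reinterpret_cast<unsigned long long*>(output);
  unsigned long long oldval = *address;
  unsigned long long assumed;
  do {
    assumed = oldval;
    double current = __longlong_as_double(assumed);
    double desired = value > current ? value : current;
    // Publish 'desired' only if the slot still holds 'assumed'.
    oldval = atomicCAS(address, assumed, __double_as_longlong(desired));
  } while (assumed != oldval);
}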

View File

@ -191,7 +191,7 @@ template <typename Self, typename Reducer, typename Device,
(TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
void operator()(Self& self, typename Self::CoeffReturnType* data) {
void operator()(Self& self, typename Self::CoeffReturnType* data) const {
Index total_size = internal::array_prod(self.dimensions());
// We fix the index along the scan axis to 0 and perform a