Mirror of https://gitlab.com/libeigen/eigen.git (synced 2025-03-07 18:27:40 +08:00)
More EIGEN_DEVICE_FUNC fixes for CUDA 10/11/12.
Parent: 2a90653395
Commit: f6cc359e10
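
Context for the patch: EIGEN_DEVICE_FUNC is Eigen's annotation macro that, under nvcc or hipcc, expands to __host__ __device__ so a function can be called from GPU kernels, and expands to nothing in host-only builds. The hunks below add it to several TensorEvaluator constructors and drop it from the TensorExecutor::run entry points. A minimal standalone sketch of the idea, with hypothetical names (this is not Eigen's actual definition):

    #if defined(__CUDACC__) || defined(__HIPCC__)
    #define DEVICE_FUNC __host__ __device__   // callable from host and device code
    #else
    #define DEVICE_FUNC                       // host-only build: expands to nothing
    #endif

    // Without DEVICE_FUNC, calling this helper from a __global__ kernel is an error.
    DEVICE_FUNC inline float clamp01(float x) {
      return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);
    }

    __global__ void clamp_kernel(float* data, int n) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] = clamp01(data[i]);
    }
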
@@ -379,7 +379,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __ldg((const float4*)from);
+  return __ldg(reinterpret_cast<const float4*>(from));
 #else
   return make_float4(from[0], from[1], from[2], from[3]);
 #endif
@@ -387,7 +387,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
 template<>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
 #if defined(EIGEN_GPU_HAS_LDG)
-  return __ldg((const double2*)from);
+  return __ldg(reinterpret_cast<const double2*>(from));
 #else
   return make_double2(from[0], from[1]);
 #endif
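
As background for the two packet-load hunks above: when the GPU's read-only data cache is available (EIGEN_GPU_HAS_LDG), Eigen loads aligned read-only packets through __ldg and otherwise falls back to element-wise construction; the change itself only swaps a C-style cast for reinterpret_cast. A minimal standalone sketch of the same guarded-load pattern, using a plain __CUDA_ARCH__ check instead of Eigen's macro:

    #include <cuda_runtime.h>

    // __ldg is available on devices of compute capability 3.5 and newer.
    __device__ __forceinline__ float4 load_readonly4(const float* from) {
    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)
      // reinterpret_cast rather than a C-style cast, as in the hunks above.
      return __ldg(reinterpret_cast<const float4*>(from));
    #else
      return make_float4(from[0], from[1], from[2], from[3]);
    #endif
    }
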
@@ -121,6 +121,7 @@
 // The __device__ annotation seems to actually be needed in some cases,
 // otherwise resulting in kernel runtime errors.
 EIGEN_NV_DIAG_SUPPRESS(2886)
 EIGEN_NV_DIAG_SUPPRESS(2929)
 EIGEN_NV_DIAG_SUPPRESS(2977)
+EIGEN_NV_DIAG_SUPPRESS(20012)
 #undef EIGEN_NV_DIAG_SUPPRESS
@@ -456,11 +456,10 @@ EIGEN_DECLARE_TEST(gpu_basic)
   // numeric_limits
   CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
 
-#if defined(__NVCC__)
-  // FIXME
-  // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
-  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
-  typedef Matrix<float,6,6> Matrix6f;
-  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
-#endif
+  // These tests require dynamic-sized matrix multiplcation, which isn't currently
+  // supported on GPU.
+
+  // CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
+  // typedef Matrix<float,6,6> Matrix6f;
+  // CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
 }
@@ -722,26 +722,26 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
 #endif
 
   // Load inputs to shared memory
-  const int first_x = blockIdx.x * maxX;
-  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
-  const int num_x_input = last_x - first_x + kernelSizeX;
+  const size_t first_x = blockIdx.x * maxX;
+  const size_t last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const size_t num_x_input = last_x - first_x + kernelSizeX;
 
-  const int first_y = blockIdx.y * maxY;
-  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
-  const int num_y_input = last_y - first_y + kernelSizeY;
+  const size_t first_y = blockIdx.y * maxY;
+  const size_t last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const size_t num_y_input = last_y - first_y + kernelSizeY;
 
-  const int first_z = blockIdx.z * maxZ;
-  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
-  const int num_z_input = last_z - first_z + kernelSizeZ;
+  const size_t first_z = blockIdx.z * maxZ;
+  const size_t last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+  const size_t num_z_input = last_z - first_z + kernelSizeZ;
 
   for (int p = 0; p < numPlanes; ++p) {
 
     const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
 
-    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
-      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
-        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+    for (size_t k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+      for (size_t j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+        for (size_t i = threadIdx.x; i < num_x_input; i += blockDim.x) {
           const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
         }
@@ -751,18 +751,18 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     __syncthreads();
 
     // Convolution
-    const int num_z_output = last_z - first_z + 1;
-    const int num_y_output = last_y - first_y + 1;
-    const int num_x_output = last_x - first_x + 1;
+    const size_t num_z_output = last_z - first_z + 1;
+    const size_t num_y_output = last_y - first_y + 1;
+    const size_t num_x_output = last_x - first_x + 1;
     const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
 
-    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
-      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
-        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+    for (size_t k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+      for (size_t j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+        for (size_t i = threadIdx.x; i < num_x_output; i += blockDim.x) {
           float result = 0.0f;
-          for (int n = 0; n < kernelSizeZ; ++n) {
-            for (int m = 0; m < kernelSizeY; ++m) {
-              for (int l = 0; l < kernelSizeX; ++l) {
+          for (size_t n = 0; n < kernelSizeZ; ++n) {
+            for (size_t m = 0; m < kernelSizeY; ++m) {
+              for (size_t l = 0; l < kernelSizeX; ++l) {
                 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
               }
             }
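
A note on the int-to-size_t switch in the two convolution hunks above: threadIdx, blockIdx and blockDim are unsigned, so signed loop counters compared against them (or against unsigned extents) draw sign-compare diagnostics and can narrow on very large tensors. A tiny illustrative kernel, not Eigen code, showing the same pattern with an unsigned index type:

    __global__ void scale(float* data, size_t n, float factor) {
      // Grid-stride loop whose index matches the width of the extent `n` and the
      // unsigned CUDA built-ins used to compute it.
      for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < n;
           i += static_cast<size_t>(blockDim.x) * gridDim.x) {
        data[i] *= factor;
      }
    }
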
@@ -378,7 +378,7 @@ struct GpuDevice {
     return stream_->deviceProperties().maxThreadsPerMultiProcessor;
   }
   EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-    return stream_->deviceProperties().sharedMemPerBlock;
+    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
   }
   EIGEN_STRONG_INLINE int majorDeviceVersion() const {
     return stream_->deviceProperties().major;
@@ -65,7 +65,8 @@ struct TensorEvaluator
       TensorBlock;
   //===--------------------------------------------------------------------===//
 
-  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  TensorEvaluator(const Derived& m, const Device& device)
       : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
         m_dims(m.dimensions()),
         m_device(device)
@@ -263,7 +264,8 @@ struct TensorEvaluator<const Derived, Device>
       TensorBlock;
   //===--------------------------------------------------------------------===//
 
-  EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+  TensorEvaluator(const Derived& m, const Device& device)
       : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
   { }
 
@@ -358,6 +360,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 {
   typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
       : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
@@ -455,6 +458,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
     RawAccess = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_device(device),
       m_functor(op.functor()),
@@ -571,6 +575,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
     RawAccess = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_device(device),
       m_functor(op.functor()),
@@ -709,6 +714,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
     RawAccess = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_functor(op.functor()),
       m_arg1Impl(op.arg1Expression(), device),
@@ -829,6 +835,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
     RawAccess = false
   };
 
+  EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
     : m_condImpl(op.ifExpression(), device),
      m_thenImpl(op.thenExpression(), device),
@@ -94,9 +94,8 @@ class TensorExecutor {
       "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
       "EIGEN_USE_SYCL before including Eigen headers.");
 
-  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const Device& device = Device()) {
+                                      const Device& device = DefaultDevice()) {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
     if (needs_assign) {
@@ -126,7 +125,6 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
  public:
   typedef typename Expression::Index StorageIndex;
 
-  EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE void run(
       const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
@@ -52,7 +52,7 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
       return;
     }
     unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
       oldval = readback;
       newval = oldval;
       reducer.reduce(accum, reinterpret_cast<T*>(&newval));
@@ -65,6 +65,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
     gpu_assert(0 && "Wordsize not supported");
   }
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
+  EIGEN_UNUSED_VARIABLE(reducer);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
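
For readers unfamiliar with the idiom in the atomicReduce hunks above: when no hardware atomic matches the reduction, the slot is updated through an atomicCAS retry loop on its raw bits. A standalone sketch of that loop (float max is just an illustrative stand-in; the Eigen version is generic over the reducer and word size):

    __device__ void atomic_max_float(float* output, float value) {
      unsigned int* addr = reinterpret_cast<unsigned int*>(output);
      unsigned int observed = *addr;
      unsigned int assumed;
      do {
        assumed = observed;
        // Apply the reduction to a local copy of the current value...
        const float candidate = fmaxf(__uint_as_float(assumed), value);
        // ...and publish it only if no other thread updated the slot meanwhile.
        observed = atomicCAS(addr, assumed, __float_as_uint(candidate));
      } while (observed != assumed);
    }
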
@@ -118,6 +121,8 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   atomicAdd(output, accum);
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
@@ -209,6 +214,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
 #endif
   }
 #else // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(semaphore);
   gpu_assert(0 && "Shouldn't be called on unsupported device");
 #endif // EIGEN_CUDA_ARCH >= 300
 }
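
The EIGEN_UNUSED_VARIABLE lines added in the fallback branches above exist because, below the supported architecture, the body collapses to a single assert and every parameter would otherwise trigger unused-parameter warnings. A minimal equivalent of such a macro, under a hypothetical name since its real definition is not part of this diff:

    #define MY_UNUSED_VARIABLE(x) ((void)(x))  // marks x as used, silencing unused-parameter warnings

    __device__ void reduce_fallback(float* output, float accum) {
      MY_UNUSED_VARIABLE(output);
      MY_UNUSED_VARIABLE(accum);
      // On unsupported devices the real code asserts instead of reducing.
    }
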
@@ -243,7 +253,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo
 
 template <typename Self,
           typename Reducer, typename Index>
-__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -715,11 +725,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
       half2* hr2 = reinterpret_cast<half2*>(&r2);
       half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
       half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
-      for (int i = 0; i < packet_width / 2; i++) {
-        hr1[i] =
-            __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
-        hr2[i] =
-            __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
+      for (int j = 0; j < packet_width / 2; j++) {
+        hr1[j] =
+            __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
+        hr2[j] =
+            __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
       }
       reducer.reducePacket(r1, &reduced_val1);
       reducer.reducePacket(r2, &reduced_val2);
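
Background for the loop above, where the patch only renames the counter from i to j (presumably to avoid reusing an enclosing index name): __shfl_down_sync lets each lane read a register from the lane `offset` positions higher, which is the building block of warp-level reductions. A self-contained sketch of that idiom for a single float:

    __device__ float warp_sum(float val) {
      // After log2(warpSize) halving steps, lane 0 holds the sum of the whole warp.
      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xFFFFFFFF, val, offset, warpSize);
      }
      return val;
    }
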
@@ -744,7 +754,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
     val = __halves2half2(val1, val2);
     if ((threadIdx.x & (warpSize - 1)) == 0) {
       half* loc = output + row;
-      atomicReduce((half2*)loc, val, reducer);
+      atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
     }
   }
 }
@@ -782,12 +792,12 @@ struct InnerReductionLauncher<
     if (num_blocks > 1) {
       // We initialize the outputs outside the reduction kernel when we can't be sure that there
       // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
+      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
       LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
-                        num_blocks, 1024, 0, device, reducer.initialize(),
+                        num_blocks2, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
     }
 
@@ -950,12 +960,12 @@ struct OuterReducer<Self, Op, GpuDevice> {
     if (num_blocks > 1) {
       // We initialize the outputs in the reduction kernel itself when we don't have to worry
       // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
+      const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() *
                              device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
       LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
-                        num_blocks, 1024, 0, device, reducer.initialize(),
+                        num_blocks2, 1024, 0, device, reducer.initialize(),
                         num_preserved_vals, output);
     }
 
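
The two launcher hunks above rename dyn_blocks, max_blocks and num_blocks to *2 variants inside the if (num_blocks > 1) branch, so the init-kernel launch size no longer shadows the enclosing num_blocks. The computation itself is the usual occupancy-capped grid size; a host-side sketch with illustrative names:

    #include <algorithm>

    inline int ceil_div(int a, int b) { return (a + b - 1) / b; }

    // Blocks needed to cover `num_outputs` with 1024-thread blocks, clamped to the
    // number of such blocks the device can keep resident at once.
    inline int init_kernel_blocks(int num_outputs, int num_sms, int max_threads_per_sm) {
      const int block_size = 1024;
      const int dyn_blocks = ceil_div(num_outputs, block_size);
      const int max_blocks = num_sms * max_threads_per_sm / block_size;
      return std::min(max_blocks, dyn_blocks);
    }
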
@@ -191,7 +191,7 @@ template <typename Self, typename Reducer, typename Device,
           (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
            internal::reducer_traits<Reducer, Device>::PacketAccess)>
 struct ScanLauncher {
-  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) const {
     Index total_size = internal::array_prod(self.dimensions());
 
     // We fix the index along the scan axis to 0 and perform a