More EIGEN_DEVICE_FUNC fixes for CUDA 10/11/12.

Antonio Sánchez 2023-02-03 19:18:45 +00:00 committed by Rasmus Munk Larsen
parent 2a90653395
commit f6cc359e10
9 changed files with 68 additions and 53 deletions
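For context: EIGEN_DEVICE_FUNC is Eigen's annotation for functions that must be callable from both host and device code; under a CUDA or HIP compiler it expands to __host__ __device__ and is empty otherwise. The sketch below shows only that general pattern, not Eigen's actual macro definitions; MY_DEVICE_FUNC, axpy and axpy_kernel are invented names.

#include <cstdio>

#if defined(__CUDACC__) || defined(__HIPCC__)
#define MY_DEVICE_FUNC __host__ __device__  // GPU build: callable on both sides
#else
#define MY_DEVICE_FUNC                      // plain host build: no annotation
#endif

// A helper usable from ordinary host code and from inside kernels.
MY_DEVICE_FUNC inline float axpy(float a, float x, float y) {
  return a * x + y;
}

#if defined(__CUDACC__)
__global__ void axpy_kernel(const float* x, float* y, float a, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = axpy(a, x[i], y[i]);  // device-side call site
}
#endif

int main() {
  std::printf("%f\n", axpy(2.0f, 3.0f, 1.0f));  // host-side call site
  return 0;
}

When such an annotation is missing on a function reached from device code, nvcc reports that a __host__ function is being called from a __host__ __device__ function; several hunks below add exactly this kind of annotation, and the rest clean up casts and signedness that newer toolkits appear to diagnose more strictly.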

View File

@ -379,7 +379,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const float4*)from);
return __ldg(reinterpret_cast<const float4*>(from));
#else
return make_float4(from[0], from[1], from[2], from[3]);
#endif
@ -387,7 +387,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const fl
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
#if defined(EIGEN_GPU_HAS_LDG)
return __ldg((const double2*)from);
return __ldg(reinterpret_cast<const double2*>(from));
#else
return make_double2(from[0], from[1]);
#endif
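The two hunks above change only the cast style. __ldg is the CUDA intrinsic that loads through the read-only data cache and requires compute capability 3.5 or newer, which is roughly what the EIGEN_GPU_HAS_LDG guard encodes. Below is a standalone sketch of the same aligned-load-with-fallback idea; HAS_LDG, load_aligned_float4 and scale_kernel are illustrative names, not Eigen's.

#include <cuda_runtime.h>

// __ldg needs sm_35 or newer; mirror that with an architecture check.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
#define HAS_LDG 1
#endif

__device__ float4 load_aligned_float4(const float* from) {
#if defined(HAS_LDG)
  // Read-only cached load; 'from' must be 16-byte aligned.
  return __ldg(reinterpret_cast<const float4*>(from));
#else
  // Fallback: four scalar loads.
  return make_float4(from[0], from[1], from[2], from[3]);
#endif
}

__global__ void scale_kernel(const float* in, float* out, int n4) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n4) {
    float4 v = load_aligned_float4(in + 4 * i);
    out[4 * i + 0] = 2.0f * v.x;
    out[4 * i + 1] = 2.0f * v.y;
    out[4 * i + 2] = 2.0f * v.z;
    out[4 * i + 3] = 2.0f * v.w;
  }
}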

View File

@ -121,6 +121,7 @@
// The __device__ annotation seems to actually be needed in some cases,
// otherwise resulting in kernel runtime errors.
EIGEN_NV_DIAG_SUPPRESS(2886)
EIGEN_NV_DIAG_SUPPRESS(2929)
EIGEN_NV_DIAG_SUPPRESS(2977)
EIGEN_NV_DIAG_SUPPRESS(20012)
#undef EIGEN_NV_DIAG_SUPPRESS
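EIGEN_NV_DIAG_SUPPRESS wraps nvcc's diagnostic-suppression pragma; this hunk simply adds warning number 2929 to the suppressed list. For readers unfamiliar with the mechanism, here is the general shape of such a macro. The MY_* names are invented, and the CUDA 11.5 cut-over for the nv_diag_suppress spelling is stated as an assumption rather than Eigen's exact logic.

#define MY_STRINGIZE(x) #x

#if defined(__NVCC__)
#  if defined(__CUDACC_VER_MAJOR__) && \
      (__CUDACC_VER_MAJOR__ > 11 || \
       (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 5))
     // Newer nvcc (roughly CUDA 11.5 on) spells the pragma nv_diag_suppress.
#    define MY_NV_DIAG_SUPPRESS(warn) _Pragma(MY_STRINGIZE(nv_diag_suppress warn))
#  else
     // Older nvcc uses the unprefixed diag_suppress form.
#    define MY_NV_DIAG_SUPPRESS(warn) _Pragma(MY_STRINGIZE(diag_suppress warn))
#  endif
#else
#  define MY_NV_DIAG_SUPPRESS(warn)  // no-op for other compilers
#endif

// Usage: silence one nvcc diagnostic number for this translation unit.
MY_NV_DIAG_SUPPRESS(2929)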

View File

@ -456,11 +456,10 @@ EIGEN_DECLARE_TEST(gpu_basic)
// numeric_limits
CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );
#if defined(__NVCC__)
// FIXME
// These subtests compile only with nvcc and fail with HIPCC and clang-cuda
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
typedef Matrix<float,6,6> Matrix6f;
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
#endif
// These tests require dynamic-sized matrix multiplication, which isn't currently
// supported on GPU.
// CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
// typedef Matrix<float,6,6> Matrix6f;
// CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
}
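The replacement comment states why the eigenvalue subtests are now commented out: they end up needing dynamic-sized matrix products, and Eigen's in-kernel support is limited to fixed-size objects whose storage lives on the stack. A small illustration of that distinction, with invented function names that are not part of the test:

#include <Eigen/Dense>

// Fixed-size matrices keep all storage on the stack, so a helper like this can
// carry a device annotation and be called from a kernel.
EIGEN_DEVICE_FUNC inline float trace_of_product(const Eigen::Matrix3f& a,
                                                const Eigen::Matrix3f& b) {
  return (a * b).trace();
}

// Dynamic-size matrices allocate from the heap; products of these are what the
// disabled subtests would need, so this variant stays host-only.
inline float trace_of_product_dyn(const Eigen::MatrixXf& a,
                                  const Eigen::MatrixXf& b) {
  return (a * b).trace();
}

int main() {
  Eigen::Matrix3f id = Eigen::Matrix3f::Identity();
  return trace_of_product(id, id) == 3.0f ? 0 : 1;
}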

View File

@ -722,26 +722,26 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
#endif
// Load inputs to shared memory
const int first_x = blockIdx.x * maxX;
const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const int num_x_input = last_x - first_x + kernelSizeX;
const size_t first_x = blockIdx.x * maxX;
const size_t last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
const size_t num_x_input = last_x - first_x + kernelSizeX;
const int first_y = blockIdx.y * maxY;
const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const int num_y_input = last_y - first_y + kernelSizeY;
const size_t first_y = blockIdx.y * maxY;
const size_t last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
const size_t num_y_input = last_y - first_y + kernelSizeY;
const int first_z = blockIdx.z * maxZ;
const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const int num_z_input = last_z - first_z + kernelSizeZ;
const size_t first_z = blockIdx.z * maxZ;
const size_t last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
const size_t num_z_input = last_z - first_z + kernelSizeZ;
for (int p = 0; p < numPlanes; ++p) {
const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
const int plane_kernel_offset = 0;
for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
for (size_t k = threadIdx.z; k < num_z_input; k += blockDim.z) {
for (size_t j = threadIdx.y; j < num_y_input; j += blockDim.y) {
for (size_t i = threadIdx.x; i < num_x_input; i += blockDim.x) {
const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
}
@ -751,18 +751,18 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
__syncthreads();
// Convolution
const int num_z_output = last_z - first_z + 1;
const int num_y_output = last_y - first_y + 1;
const int num_x_output = last_x - first_x + 1;
const size_t num_z_output = last_z - first_z + 1;
const size_t num_y_output = last_y - first_y + 1;
const size_t num_x_output = last_x - first_x + 1;
const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
for (size_t k = threadIdx.z; k < num_z_output; k += blockDim.z) {
for (size_t j = threadIdx.y; j < num_y_output; j += blockDim.y) {
for (size_t i = threadIdx.x; i < num_x_output; i += blockDim.x) {
float result = 0.0f;
for (int n = 0; n < kernelSizeZ; ++n) {
for (int m = 0; m < kernelSizeY; ++m) {
for (int l = 0; l < kernelSizeX; ++l) {
for (size_t n = 0; n < kernelSizeZ; ++n) {
for (size_t m = 0; m < kernelSizeY; ++m) {
for (size_t l = 0; l < kernelSizeX; ++l) {
result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
}
}
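This hunk switches the tile extents and loop counters from int to size_t, presumably to keep the comparisons and arithmetic against the unsigned CUDA built-ins (threadIdx.* and blockDim.* are unsigned int) in a single signedness. A reduced sketch of the same strided shared-memory loop shape; the kernel name and buffers are made up for illustration:

#include <cuda_runtime.h>
#include <cstddef>

__global__ void tile_copy(const float* in, float* out, size_t num_x, size_t num_y) {
  extern __shared__ float tile[];

  // size_t counters compare against the unsigned thread/block built-ins
  // without sign-conversion diagnostics.
  for (size_t j = threadIdx.y; j < num_y; j += blockDim.y) {
    for (size_t i = threadIdx.x; i < num_x; i += blockDim.x) {
      tile[i + num_x * j] = in[i + num_x * j];
    }
  }
  __syncthreads();

  for (size_t j = threadIdx.y; j < num_y; j += blockDim.y) {
    for (size_t i = threadIdx.x; i < num_x; i += blockDim.x) {
      out[i + num_x * j] = 2.0f * tile[i + num_x * j];
    }
  }
}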

View File

@ -378,7 +378,7 @@ struct GpuDevice {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
}
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
}
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
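cudaDeviceProp::sharedMemPerBlock is a size_t, so returning it from an int accessor is a narrowing conversion; the static_cast makes that narrowing explicit instead of implicit. A standalone version of the same query, with error handling reduced to a bare status check and the function name invented here:

#include <cuda_runtime.h>
#include <cstdio>

// Query the per-block shared memory size and return it as int.
int shared_mem_per_block(int device_id) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) return -1;
  // prop.sharedMemPerBlock is a size_t; the cast documents the narrowing.
  return static_cast<int>(prop.sharedMemPerBlock);
}

int main() {
  std::printf("shared mem per block: %d bytes\n", shared_mem_per_block(0));
  return 0;
}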

View File

@ -65,7 +65,8 @@ struct TensorEvaluator
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
m_dims(m.dimensions()),
m_device(device)
@ -263,7 +264,8 @@ struct TensorEvaluator<const Derived, Device>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
@ -358,6 +360,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
{
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
{ }
@ -455,6 +458,7 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
@ -571,6 +575,7 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
@ -709,6 +714,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()),
m_arg1Impl(op.arg1Expression(), device),
@ -829,6 +835,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
RawAccess = false
};
EIGEN_DEVICE_FUNC
TensorEvaluator(const XprType& op, const Device& device)
: m_condImpl(op.ifExpression(), device),
m_thenImpl(op.thenExpression(), device),
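The pattern throughout this file is to add EIGEN_DEVICE_FUNC to the evaluator constructors so they can be instantiated from device code as well as host code; without it, nvcc raises the "calling a __host__ function from a __host__ __device__ function" family of diagnostics at the device-side call sites. A reduced sketch of the idea, using an invented evaluator-like class rather than Eigen's TensorEvaluator:

#include <Eigen/Core>

// Because the constructor and coeff() are marked EIGEN_DEVICE_FUNC, a
// __device__ caller may construct and use this wrapper.
template <typename Scalar>
class ScaledView {
 public:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  ScaledView(const Scalar* data, Scalar scale) : m_data(data), m_scale(scale) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  Scalar coeff(int i) const { return m_scale * m_data[i]; }

 private:
  const Scalar* m_data;
  Scalar m_scale;
};

#if defined(EIGEN_GPUCC)
// Device-side construction compiles cleanly only because of the annotations.
__global__ void scale_kernel(const float* in, float* out, int n) {
  ScaledView<float> view(in, 2.0f);
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = view.coeff(i);
}
#endif

int main() {
  const float data[3] = {1.0f, 2.0f, 3.0f};
  ScaledView<float> view(data, 2.0f);
  return view.coeff(2) == 6.0f ? 0 : 1;
}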

View File

@ -94,9 +94,8 @@ class TensorExecutor {
"You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
"EIGEN_USE_SYCL before including Eigen headers.");
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const Device& device = Device()) {
const Device& device = DefaultDevice()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
@ -126,7 +125,6 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
public:
typedef typename Expression::Index StorageIndex;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(
const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
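Here the annotation moves in the opposite direction: these run() overloads build an evaluator and drive the assignment loop on the host, so EIGEN_DEVICE_FUNC is removed rather than propagated, presumably because annotating a host-only driver only produces spurious host-call diagnostics. A sketch of that rule of thumb, with a made-up host-only helper and functor:

#include <cstddef>
#include <vector>

#if defined(__CUDACC__)
#define MAYBE_DEVICE __host__ __device__
#else
#define MAYBE_DEVICE
#endif

// Host-only driver: it uses host facilities (std::vector, heap allocation),
// so it deliberately carries no device annotation.
template <typename F>
void run_on_host(F f, int n) {
  std::vector<float> buffer(static_cast<std::size_t>(n));
  for (int i = 0; i < n; ++i) buffer[static_cast<std::size_t>(i)] = f(i);
}

// The functor itself can still be device-capable; the annotation belongs on
// the pieces that genuinely run on both sides, not on the host driver.
struct Square {
  MAYBE_DEVICE float operator()(int i) const { return static_cast<float>(i * i); }
};

int main() {
  run_on_host(Square{}, 16);
  return 0;
}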

View File

@ -52,7 +52,7 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
return;
}
unsigned long long readback;
while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
@ -65,6 +65,9 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
gpu_assert(0 && "Wordsize not supported");
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
EIGEN_UNUSED_VARIABLE(reducer);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
@ -118,6 +121,8 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(accum);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
@ -209,6 +214,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer
#endif
}
#else // EIGEN_CUDA_ARCH >= 300
EIGEN_UNUSED_VARIABLE(reducer);
EIGEN_UNUSED_VARIABLE(input);
EIGEN_UNUSED_VARIABLE(num_coeffs);
EIGEN_UNUSED_VARIABLE(output);
EIGEN_UNUSED_VARIABLE(semaphore);
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
@ -243,7 +253,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo
template <typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/, Index num_coeffs, half* output) {
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
const Index num_threads = blockDim.x * gridDim.x;
typedef typename packet_traits<Eigen::half>::type PacketType;
@ -715,11 +725,11 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
hr1[i] =
__shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
hr2[i] =
__shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
for (int j = 0; j < packet_width / 2; j++) {
hr1[j] =
__shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
hr2[j] =
__shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
@ -744,7 +754,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reduc
val = __halves2half2(val1, val2);
if ((threadIdx.x & (warpSize - 1)) == 0) {
half* loc = output + row;
atomicReduce((half2*)loc, val, reducer);
atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
}
}
}
@ -782,12 +792,12 @@ struct InnerReductionLauncher<
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be a race condition between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumGpuMultiProcessors() *
const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
const int max_blocks2 = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_blocks2, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
@ -950,12 +960,12 @@ struct OuterReducer<Self, Op, GpuDevice> {
if (num_blocks > 1) {
// We initialize the outputs in the reduction kernel itself when we don't have to worry
// about race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumGpuMultiProcessors() *
const int dyn_blocks2 = divup<int>(num_preserved_vals, 1024);
const int max_blocks2 = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_blocks2, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
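Most of this file's churn is mechanical: reinterpret_cast instead of C-style casts, EIGEN_UNUSED_VARIABLE on the dead fallback branches, and renames such as dyn_blocks2 which avoid shadowing the identically named variables in the enclosing scope. The CAS loop itself is worth spelling out, though, since it is the standard way to build an arbitrary atomic reduction from a 64-bit atomicCAS; below is a self-contained sketch specialised to a double-precision max, with the function name invented here:

#include <cuda_runtime.h>

// Atomic max for double built from a 64-bit compare-and-swap, mirroring the
// shape of the atomicReduce CAS loop above.
__device__ void atomic_max_double(double* output, double value) {
  // Reinterpret the slot as a 64-bit integer so atomicCAS can operate on it.
  unsigned long long* address = reinterpret_cast<unsigned long long*>(output);
  unsigned long long oldval = *address;
  unsigned long long assumed;
  do {
    assumed = oldval;
    double current = __longlong_as_double(assumed);
    double desired = value > current ? value : current;
    // Publish 'desired' only if the slot still holds 'assumed'.
    oldval = atomicCAS(address, assumed, __double_as_longlong(desired));
  } while (assumed != oldval);
}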

View File

@ -191,7 +191,7 @@ template <typename Self, typename Reducer, typename Device,
(TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
void operator()(Self& self, typename Self::CoeffReturnType* data) {
void operator()(Self& self, typename Self::CoeffReturnType* data) const {
Index total_size = internal::array_prod(self.dimensions());
// We fix the index along the scan axis to 0 and perform a