This commit contains the following (HIP specific) updates:

- unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h Changing "pass-by-reference" argument to be "pass-by-value" instead (in a __global__ function decl). "pass-by-reference" arguments to __global__ functions are unwise, and will be explicitly flagged as errors by the newer versions of HIP. - Eigen/src/Core/util/Memory.h - unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h Changes introduced in recent commits breaks the HIP compile. Adding EIGEN_DEVICE_FUNC attribute to some functions and calling ::malloc/free instead of the corresponding std:: versions to get the HIP compile working again - unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h Change introduced a recent commit breaks the HIP compile (link stage errors out due to failure to inline a function). Disabling the recently introduced code (only for HIP compile), to get the eigen nightly testing going again. Will submit another PR once we have te proper fix. - Eigen/src/Core/util/ConfigureVectorization.h Enabling GPU VECTOR support when HIP compiler is in use (for both the host and device compile phases)
2024-12-21 07:19:46 +08:00 · 2018-10-01 14:28:37 +00:00 · 2018-10-01 14:28:37 +00:00 · 94898488a6
commit 94898488a6
parent e95696acb3
5 changed files with 26 additions and 10 deletions
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@ -379,10 +379,12 @@
  #include <cuda_fp16.h>
 #endif

-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_HIPCC)
  #define EIGEN_VECTORIZE_GPU
  #include <hip/hip_vector_types.h>
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)

  #define EIGEN_HAS_HIP_FP16
  #include <hip/hip_fp16.h>
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@ -96,10 +96,16 @@ inline void throw_std_bad_alloc()
 /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
  * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
  */
-inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
  eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
+  
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  void *original = ::malloc(size+alignment);
+#else
  void *original = std::malloc(size+alignment);
+#endif
+  
  if (original == 0) return 0;
  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
  *(reinterpret_cast<void**>(aligned) - 1) = original;
@ -107,9 +113,15 @@ inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = E
 }

 /** \internal Frees memory allocated with handmade_aligned_malloc */
-inline void handmade_aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
 {
-  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
+  if (ptr) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+    ::free(*(reinterpret_cast<void**>(ptr) - 1));
+#else
+    std::free(*(reinterpret_cast<void**>(ptr) - 1));
+#endif
+  }
 }

 /** \internal
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@ -186,21 +186,21 @@ struct TensorContractionKernel {
      /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
      GebpKernel;

-  EIGEN_DONT_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
  static void packLhs(LhsScalar* lhsBlock,
                      const typename LhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex rows) {
    LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
  }

-  EIGEN_DONT_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
  static void packRhs(RhsScalar* rhsBlock,
                      const typename RhsMapper::SubMapper& data_mapper,
                      const StorageIndex depth, const StorageIndex cols) {
    RhsPacker()(rhsBlock, data_mapper, depth, cols);
  }

-  EIGEN_DONT_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
  static void invoke(const OutputMapper& output_mapper,
                     const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
                     const StorageIndex rows, const StorageIndex depth,
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@ -218,6 +218,7 @@ struct InnerMostDimReducer<Self, Op, false, true> {
  }
 };

+#if !defined(EIGEN_HIPCC) 
 template <typename Self, typename Op>
 struct InnerMostDimReducer<Self, Op, true, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
@ -257,7 +258,8 @@ struct InnerMostDimReducer<Self, Op, true, true> {
    }
  }
 };
-
+#endif
+ 
 template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
 struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@ -292,7 +292,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
 }

 template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {
  eigen_assert(threadIdx.x == 1);
  half tmp = __low2half(*scratch);
  reducer.reduce(__high2half(*scratch), &tmp);