From 94898488a6fe3096a7a44d0bb108e514f0e44699 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Mon, 1 Oct 2018 14:28:37 +0000
Subject: [PATCH] This commit contains the following (HIP specific) updates:

- unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h

  Changing a "pass-by-reference" argument to "pass-by-value" in a
  __global__ function declaration. "pass-by-reference" arguments to
  __global__ functions are unwise, and are explicitly flagged as errors
  by newer versions of HIP (a minimal sketch of the issue follows this
  list).

- Eigen/src/Core/util/Memory.h
- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h

  Changes introduced in recent commits break the HIP compile. Adding
  the EIGEN_DEVICE_FUNC attribute to some functions and calling
  ::malloc/::free instead of the corresponding std:: versions gets the
  HIP compile working again.

- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h

  A change introduced in a recent commit breaks the HIP compile (the
  link stage errors out due to a failure to inline a function).
  Disabling the recently introduced code (only for the HIP compile) to
  get the Eigen nightly testing going again. Will submit another PR
  once we have the proper fix.

- Eigen/src/Core/util/ConfigureVectorization.h

  Enabling GPU vectorization support when the HIP compiler is in use
  (for both the host and device compile phases).
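
For illustration, a minimal standalone sketch of the __global__ argument
issue (hypothetical kernel and functor names, not the actual Eigen code):

    #include <hip/hip_runtime.h>

    struct SumReducer {
      __device__ float initialize() const { return 0.0f; }
    };

    // Newer hipcc rejects a reference kernel parameter such as
    //   __global__ void cleanup(Op& reducer, float* out);
    // because the referenced host object is not addressable from device
    // code. Passing the functor by value copies it into the kernel
    // argument buffer at launch time, which is the pattern used here:
    template <typename Op>
    __global__ void cleanup(Op reducer, float* out) {
      *out = reducer.initialize();
    }

    // launch example: cleanup<<<1, 1>>>(SumReducer(), device_out);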
---
 Eigen/src/Core/util/ConfigureVectorization.h       |  6 ++++--
 Eigen/src/Core/util/Memory.h                       | 18 +++++++++++++++---
 .../Eigen/CXX11/src/Tensor/TensorContraction.h     |  6 +++---
 .../Eigen/CXX11/src/Tensor/TensorReduction.h       |  4 +++-
 .../CXX11/src/Tensor/TensorReductionGpu.h          |  2 +-
 5 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index e75c7d89e..a2743624e 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -379,10 +379,12 @@
   #include
 #endif
 
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
+#if defined(EIGEN_HIPCC)
   #define EIGEN_VECTORIZE_GPU
   #include <hip/hip_vector_types.h>
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
 
   #define EIGEN_HAS_HIP_FP16
   #include <hip/hip_fp16.h>
diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h
index 9dd2e0252..c624556c5 100644
--- a/Eigen/src/Core/util/Memory.h
+++ b/Eigen/src/Core/util/Memory.h
@@ -96,10 +96,16 @@ inline void throw_std_bad_alloc()
 /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
   * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
   */
-inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
 {
   eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  void *original = ::malloc(size+alignment);
+#else
   void *original = std::malloc(size+alignment);
+#endif
+
   if (original == 0) return 0;
   void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
@@ -107,9 +113,15 @@ inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = E
 }
 
 /** \internal Frees memory allocated with handmade_aligned_malloc */
-inline void handmade_aligned_free(void *ptr)
+EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
 {
-  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
+  if (ptr) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+    ::free(*(reinterpret_cast<void**>(ptr) - 1));
+#else
+    std::free(*(reinterpret_cast<void**>(ptr) - 1));
+#endif
+  }
 }
 
 /** \internal
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index b92753c44..6fc1e4a6e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -186,21 +186,21 @@ struct TensorContractionKernel {
     /*ConjugateLhs*/ false, /*ConjugateRhs*/ false> GebpKernel;
 
-  EIGEN_DONT_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
   static void packLhs(LhsScalar* lhsBlock,
                       const typename LhsMapper::SubMapper& data_mapper,
                       const StorageIndex depth, const StorageIndex rows) {
     LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
   }
 
-  EIGEN_DONT_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
   static void packRhs(RhsScalar* rhsBlock,
                       const typename RhsMapper::SubMapper& data_mapper,
                       const StorageIndex depth, const StorageIndex cols) {
     RhsPacker()(rhsBlock, data_mapper, depth, cols);
   }
 
-  EIGEN_DONT_INLINE
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
   static void invoke(const OutputMapper& output_mapper,
                      const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
                      const StorageIndex rows, const StorageIndex depth,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 949764f3a..2c69e4fd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -218,6 +218,7 @@ struct InnerMostDimReducer {
   }
 };
 
+#if !defined(EIGEN_HIPCC)
 template
 struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
@@ -257,7 +258,8 @@ struct InnerMostDimReducer {
     }
   }
 };
-
+#endif
+
 template
 struct InnerMostDimPreserver {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 88940e6e6..375c570b3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -292,7 +292,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
 }
 
 template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
+__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {
   eigen_assert(threadIdx.x == 1);
   half tmp = __low2half(*scratch);
   reducer.reduce(__high2half(*scratch), &tmp);