This commit contains the following (HIP specific) updates:

- unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
  Changing "pass-by-reference" argument to be "pass-by-value" instead
  (in a  __global__ function decl).
  "pass-by-reference" arguments to __global__ functions are unwise,
  and will be explicitly flagged as errors by the newer versions of HIP.

- Eigen/src/Core/util/Memory.h
- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
  Changes introduced in recent commits breaks the HIP compile.
  Adding EIGEN_DEVICE_FUNC attribute to some functions and
  calling ::malloc/free instead of the corresponding std:: versions
  to get the HIP compile working again

- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
  Change introduced a recent commit breaks the HIP compile
  (link stage errors out due to failure to inline a function).
  Disabling the recently introduced code (only for HIP compile), to get
  the eigen nightly testing going again.
  Will submit another PR once we have te proper fix.

- Eigen/src/Core/util/ConfigureVectorization.h
  Enabling GPU VECTOR support when HIP compiler is in use
  (for both the host and device compile phases)
This commit is contained in:
Deven Desai 2018-10-01 14:28:37 +00:00
parent e95696acb3
commit 94898488a6
5 changed files with 26 additions and 10 deletions

View File

@ -379,10 +379,12 @@
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif #endif
#if defined(EIGEN_HIP_DEVICE_COMPILE) #if defined(EIGEN_HIPCC)
#define EIGEN_VECTORIZE_GPU #define EIGEN_VECTORIZE_GPU
#include <hip/hip_vector_types.h> #include <hip/hip_vector_types.h>
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#define EIGEN_HAS_HIP_FP16 #define EIGEN_HAS_HIP_FP16
#include <hip/hip_fp16.h> #include <hip/hip_fp16.h>

View File

@ -96,10 +96,16 @@ inline void throw_std_bad_alloc()
/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
* Fast, but wastes 16 additional bytes of memory. Does not throw any exception. * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
*/ */
inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
{ {
eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2"); eigen_assert(alignment >= sizeof(void*) && (alignment & -alignment) == alignment && "Alignment must be at least sizeof(void*) and a power of 2");
#if defined(EIGEN_HIP_DEVICE_COMPILE)
void *original = ::malloc(size+alignment);
#else
void *original = std::malloc(size+alignment); void *original = std::malloc(size+alignment);
#endif
if (original == 0) return 0; if (original == 0) return 0;
void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment); void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
*(reinterpret_cast<void**>(aligned) - 1) = original; *(reinterpret_cast<void**>(aligned) - 1) = original;
@ -107,9 +113,15 @@ inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = E
} }
/** \internal Frees memory allocated with handmade_aligned_malloc */ /** \internal Frees memory allocated with handmade_aligned_malloc */
inline void handmade_aligned_free(void *ptr) EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
{ {
if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1)); if (ptr) {
#if defined(EIGEN_HIP_DEVICE_COMPILE)
::free(*(reinterpret_cast<void**>(ptr) - 1));
#else
std::free(*(reinterpret_cast<void**>(ptr) - 1));
#endif
}
} }
/** \internal /** \internal

View File

@ -186,21 +186,21 @@ struct TensorContractionKernel {
/*ConjugateLhs*/ false, /*ConjugateRhs*/ false> /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
GebpKernel; GebpKernel;
EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void packLhs(LhsScalar* lhsBlock, static void packLhs(LhsScalar* lhsBlock,
const typename LhsMapper::SubMapper& data_mapper, const typename LhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex rows) { const StorageIndex depth, const StorageIndex rows) {
LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0); LhsPacker()(lhsBlock, data_mapper, depth, rows, /*stride*/ 0, /*offset*/ 0);
} }
EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void packRhs(RhsScalar* rhsBlock, static void packRhs(RhsScalar* rhsBlock,
const typename RhsMapper::SubMapper& data_mapper, const typename RhsMapper::SubMapper& data_mapper,
const StorageIndex depth, const StorageIndex cols) { const StorageIndex depth, const StorageIndex cols) {
RhsPacker()(rhsBlock, data_mapper, depth, cols); RhsPacker()(rhsBlock, data_mapper, depth, cols);
} }
EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE
static void invoke(const OutputMapper& output_mapper, static void invoke(const OutputMapper& output_mapper,
const LhsScalar* lhsBlock, const RhsScalar* rhsBlock, const LhsScalar* lhsBlock, const RhsScalar* rhsBlock,
const StorageIndex rows, const StorageIndex depth, const StorageIndex rows, const StorageIndex depth,

View File

@ -218,6 +218,7 @@ struct InnerMostDimReducer<Self, Op, false, true> {
} }
}; };
#if !defined(EIGEN_HIPCC)
template <typename Self, typename Op> template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> { struct InnerMostDimReducer<Self, Op, true, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
@ -257,6 +258,7 @@ struct InnerMostDimReducer<Self, Op, true, true> {
} }
} }
}; };
#endif
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)> template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver { struct InnerMostDimPreserver {

View File

@ -292,7 +292,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
} }
template <typename Op> template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) { __global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {
eigen_assert(threadIdx.x == 1); eigen_assert(threadIdx.x == 1);
half tmp = __low2half(*scratch); half tmp = __low2half(*scratch);
reducer.reduce(__high2half(*scratch), &tmp); reducer.reduce(__high2half(*scratch), &tmp);