Adding support for using Eigen in HIP kernels.

This commit enables the use of Eigen on HIP kernels / AMD GPUs. Support has been added along the same lines as what already exists for using Eigen in CUDA kernels / NVidia GPUs.

Application code needs to explicitly define EIGEN_USE_HIP when using Eigen in HIP kernels. This is because some of the CUDA headers get picked up by default during Eigen compile (irrespective of whether or not the underlying compiler is CUDACC/NVCC, for e.g. Eigen/src/Core/arch/CUDA/Half.h). In order to maintain this behavior, the EIGEN_USE_HIP macro is used to switch to using the HIP version of those header files (see Eigen/Core and unsupported/Eigen/CXX11/Tensor)


Use the "-DEIGEN_TEST_HIP" cmake option to enable the HIP specific unit tests.
This commit is contained in:
Deven Desai 2018-06-06 10:12:58 -04:00
parent e206f8d4a4
commit 8fbd47052b
50 changed files with 9527 additions and 94 deletions

View File

@ -22,6 +22,17 @@
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
#endif
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
// analogous to EIGEN_CUDACC, but for HIP
#define EIGEN_HIPCC __HIPCC__
#endif
// NVCC is not supported as the target platform for HIPCC
// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
#if defined(__NVCC__) && defined(__HIPCC__)
#error "NVCC as the target platform for HIPCC is currently not supported."
#endif
// Starting with CUDA 9 the composite __CUDACC_VER__ is not available.
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
@ -32,8 +43,8 @@
#endif
// Handle NVCC/CUDA/SYCL
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__)
// Do not try asserts on CUDA and SYCL!
#if defined(EIGEN_CUDACC) || defined(__SYCL_DEVICE_ONLY__) || defined(EIGEN_HIPCC)
// Do not try asserts on CUDA, HIP and SYCL!
#ifndef EIGEN_NO_DEBUG
#define EIGEN_NO_DEBUG
#endif
@ -57,6 +68,26 @@
// We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
// works properly on the device side
#include <cuda_runtime.h>
#elif defined(EIGEN_HIPCC)
// Do not try to vectorize on HIP
#ifndef EIGEN_DONT_VECTORIZE
#define EIGEN_DONT_VECTORIZE
#endif
#define EIGEN_DEVICE_FUNC __host__ __device__
// We need hip_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
// works properly on the device side
#include <hip/hip_runtime.h>
#if defined(__HIP_DEVICE_COMPILE__) && !defined(EIGEN_NO_HIP)
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
// Note this check needs to come after we include hip_runtime.h since
// hip_runtime.h includes hip_common.h which in turn has the define
// for __HIP_DEVICE_COMPILE__
#endif
#else
#define EIGEN_DEVICE_FUNC
#endif
@ -68,16 +99,16 @@
#define EIGEN_DONT_VECTORIZE
#endif
// When compiling CUDA device code with NVCC, pull in math functions from the
// global namespace. In host mode, and when device doee with clang, use the
// std versions.
#if defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)
// When compiling CUDA device code with NVCC, or HIP device code with HIPCC
// pull in math functions from the global namespace. In host mode, and when
// device doee with clang, use the std versions.
#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIPCC__))
#define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
#else
#define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
#endif
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
#define EIGEN_EXCEPTIONS
#endif
@ -270,6 +301,17 @@
#include <cuda_fp16.h>
#endif
#if defined(EIGEN_HIPCC) && defined(EIGEN_HIP_DEVICE_COMPILE)
#define EIGEN_HAS_HIP_FP16
#include <hip/hip_fp16.h>
#define HIP_PATCH_WITH_NEW_FP16 18215
#if (HIP_VERSION_PATCH < HIP_PATCH_WITH_NEW_FP16)
#define EIGEN_HAS_OLD_HIP_FP16
// Old HIP implementation does not have a explicit typedef for "half2"
typedef __half2 half2;
#endif
#endif
#if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
#define EIGEN_HAS_OPENMP
#endif
@ -390,7 +432,6 @@ using std::ptrdiff_t;
#include "src/Core/util/IntegralConstant.h"
#include "src/Core/util/SymbolicIndex.h"
#include "src/Core/NumTraits.h"
#include "src/Core/MathFunctions.h"
#include "src/Core/GenericPacketMath.h"
@ -434,9 +475,15 @@ using std::ptrdiff_t;
#endif
// Half float support
#include "src/Core/arch/CUDA/Half.h"
#include "src/Core/arch/CUDA/PacketMathHalf.h"
#include "src/Core/arch/CUDA/TypeCasting.h"
#if defined EIGEN_USE_HIP
#include "src/Core/arch/HIP/hcc/Half.h"
#include "src/Core/arch/HIP/hcc/PacketMathHalf.h"
#include "src/Core/arch/HIP/hcc/TypeCasting.h"
#else
#include "src/Core/arch/CUDA/Half.h"
#include "src/Core/arch/CUDA/PacketMathHalf.h"
#include "src/Core/arch/CUDA/TypeCasting.h"
#endif
#if defined EIGEN_VECTORIZE_CUDA
#include "src/Core/arch/CUDA/PacketMath.h"

View File

@ -299,7 +299,11 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu
{ pstore(to, from); }
/** \internal tries to do cache prefetching of \a addr */
template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
template<typename Scalar>
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
inline void prefetch(const Scalar* addr)
{
#ifdef EIGEN_CUDA_ARCH
#if defined(__LP64__)
@ -528,7 +532,7 @@ inline void palign(PacketType& first, const PacketType& second)
***************************************************************************/
// Eigen+CUDA does not support complexes.
#ifndef EIGEN_CUDACC
#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }

View File

@ -96,7 +96,7 @@ struct real_default_impl<Scalar,true>
template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
#ifdef EIGEN_CUDA_ARCH
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T>
struct real_impl<std::complex<T> >
{
@ -144,7 +144,7 @@ struct imag_default_impl<Scalar,true>
template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
#ifdef EIGEN_CUDA_ARCH
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T>
struct imag_impl<std::complex<T> >
{
@ -260,7 +260,7 @@ struct conj_default_impl<Scalar,true>
template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
#ifdef EIGEN_CUDA_ARCH
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T>
struct conj_impl<std::complex<T> >
{
@ -435,7 +435,12 @@ struct round_retval
struct arg_impl {
static inline Scalar run(const Scalar& x)
{
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIP does not seem to have a native device side implementation for the math routine "arg"
using std::arg;
#else
EIGEN_USING_STD_MATH(arg);
#endif
return arg(x);
}
};
@ -768,7 +773,9 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isfinite_impl(const T& x)
{
#ifdef EIGEN_CUDA_ARCH
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return isfinite(x);
#elif defined(EIGEN_CUDA_ARCH)
return (::isfinite)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isfinite;
@ -783,7 +790,9 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isinf_impl(const T& x)
{
#ifdef EIGEN_CUDA_ARCH
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return isinf(x);
#elif defined(EIGEN_CUDA_ARCH)
return (::isinf)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isinf;
@ -798,7 +807,9 @@ EIGEN_DEVICE_FUNC
typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
isnan_impl(const T& x)
{
#ifdef EIGEN_CUDA_ARCH
#if defined(EIGEN_HIP_DEVICE_COMPILE)
return isnan(x);
#elif defined(EIGEN_CUDA_ARCH)
return (::isnan)(x);
#elif EIGEN_USE_STD_FPCLASSIFY
using std::isnan;
@ -864,7 +875,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
namespace numext {
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE) && !defined(__SYCL_DEVICE_ONLY__)
template<typename T>
EIGEN_DEVICE_FUNC
EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@ -1078,7 +1089,7 @@ EIGEN_ALWAYS_INLINE float log1p(float x) { return cl::sycl::log1p(x); }
EIGEN_ALWAYS_INLINE double log1p(double x) { return cl::sycl::log1p(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log1p(const float &x) { return ::log1pf(x); }
@ -1136,7 +1147,7 @@ EIGEN_ALWAYS_INLINE float floor(float x) { return cl::sycl::floor(x); }
EIGEN_ALWAYS_INLINE double floor(double x) { return cl::sycl::floor(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float floor(const float &x) { return ::floorf(x); }
@ -1157,7 +1168,7 @@ EIGEN_ALWAYS_INLINE float ceil(float x) { return cl::sycl::ceil(x); }
EIGEN_ALWAYS_INLINE double ceil(double x) { return cl::sycl::ceil(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float ceil(const float &x) { return ::ceilf(x); }
@ -1215,7 +1226,7 @@ EIGEN_ALWAYS_INLINE double log(double x) { return cl::sycl::log(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float log(const float &x) { return ::logf(x); }
@ -1243,7 +1254,7 @@ EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); }
EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float abs(const float &x) { return ::fabsf(x); }
@ -1273,7 +1284,7 @@ EIGEN_ALWAYS_INLINE float exp(float x) { return cl::sycl::exp(x); }
EIGEN_ALWAYS_INLINE double exp(double x) { return cl::sycl::exp(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float exp(const float &x) { return ::expf(x); }
@ -1309,7 +1320,7 @@ EIGEN_ALWAYS_INLINE float expm1(float x) { return cl::sycl::expm1(x); }
EIGEN_ALWAYS_INLINE double expm1(double x) { return cl::sycl::expm1(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float expm1(const float &x) { return ::expm1f(x); }
@ -1329,7 +1340,7 @@ EIGEN_ALWAYS_INLINE float cos(float x) { return cl::sycl::cos(x); }
EIGEN_ALWAYS_INLINE double cos(double x) { return cl::sycl::cos(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cos(const float &x) { return ::cosf(x); }
@ -1349,7 +1360,7 @@ EIGEN_ALWAYS_INLINE float sin(float x) { return cl::sycl::sin(x); }
EIGEN_ALWAYS_INLINE double sin(double x) { return cl::sycl::sin(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sin(const float &x) { return ::sinf(x); }
@ -1369,7 +1380,7 @@ EIGEN_ALWAYS_INLINE float tan(float x) { return cl::sycl::tan(x); }
EIGEN_ALWAYS_INLINE double tan(double x) { return cl::sycl::tan(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tan(const float &x) { return ::tanf(x); }
@ -1400,7 +1411,7 @@ EIGEN_ALWAYS_INLINE float acosh(float x) { return cl::sycl::acosh(x); }
EIGEN_ALWAYS_INLINE double acosh(double x) { return cl::sycl::acosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float acos(const float &x) { return ::acosf(x); }
@ -1431,7 +1442,7 @@ EIGEN_ALWAYS_INLINE float asinh(float x) { return cl::sycl::asinh(x); }
EIGEN_ALWAYS_INLINE double asinh(double x) { return cl::sycl::asinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float asin(const float &x) { return ::asinf(x); }
@ -1462,7 +1473,7 @@ EIGEN_ALWAYS_INLINE float atanh(float x) { return cl::sycl::atanh(x); }
EIGEN_ALWAYS_INLINE double atanh(double x) { return cl::sycl::atanh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float atan(const float &x) { return ::atanf(x); }
@ -1483,7 +1494,7 @@ EIGEN_ALWAYS_INLINE float cosh(float x) { return cl::sycl::cosh(x); }
EIGEN_ALWAYS_INLINE double cosh(double x) { return cl::sycl::cosh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float cosh(const float &x) { return ::coshf(x); }
@ -1503,7 +1514,7 @@ EIGEN_ALWAYS_INLINE float sinh(float x) { return cl::sycl::sinh(x); }
EIGEN_ALWAYS_INLINE double sinh(double x) { return cl::sycl::sinh(x); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float sinh(const float &x) { return ::sinhf(x); }
@ -1521,12 +1532,12 @@ T tanh(const T &x) {
#if defined(__SYCL_DEVICE_ONLY__)
EIGEN_ALWAYS_INLINE float tanh(float x) { return cl::sycl::tanh(x); }
EIGEN_ALWAYS_INLINE double tanh(double x) { return cl::sycl::tanh(x); }
#elif (!defined(EIGEN_CUDACC)) && EIGEN_FAST_MATH
#elif (!defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)) && EIGEN_FAST_MATH
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(float x) { return internal::generic_fast_tanh_float(x); }
#endif
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float tanh(const float &x) { return ::tanhf(x); }
@ -1546,7 +1557,7 @@ EIGEN_ALWAYS_INLINE float fmod(float x, float y) { return cl::sycl::fmod(x, y)
EIGEN_ALWAYS_INLINE double fmod(double x, double y) { return cl::sycl::fmod(x, y); }
#endif // defined(__SYCL_DEVICE_ONLY__)
#ifdef EIGEN_CUDACC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float fmod(const float& a, const float& b) {

View File

@ -137,6 +137,9 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
{
typedef Product<Lhs,Rhs,Options> SrcXprType;
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
static EIGEN_STRONG_INLINE
void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
{
@ -390,6 +393,9 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
typedef typename Product<Lhs,Rhs>::Scalar Scalar;
template<typename Dst>
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
{
// Same as: dst.noalias() = lhs.lazyProduct(rhs);

View File

@ -0,0 +1,705 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//
// The conversion routines are Copyright (c) Fabian Giesen, 2016.
// The original license follows:
//
// Copyright (c) Fabian Giesen, 2016
// All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Standard 16-bit float type, mostly useful for GPUs. Defines a new
// type Eigen::half (inheriting from HIP's __half struct) with
// operator overloads such that it behaves basically as an arithmetic
// type. It will be quite slow on CPUs (so it is recommended to stay
// in fp32 for CPUs, except for simple parameter conversions, I/O
// to disk and the likes), but fast on GPUs.
#ifndef EIGEN_HALF_HIP_H
#define EIGEN_HALF_HIP_H
#if __cplusplus > 199711L
#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
#else
#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type()
#endif
namespace Eigen {
struct half;
namespace half_impl {
#if !defined(EIGEN_HAS_HIP_FP16)
// Make our own __half_raw definition that is similar to CUDA's.
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
unsigned short x;
};
#elif defined(EIGEN_HAS_OLD_HIP_FP16)
// Make a __half_raw definition that is
// ++ compatible with that of Eigen and
// ++ add a implcit conversion to the native __half of the old HIP implementation.
//
// Keeping ".x" as "unsigned short" keeps the interface the same between the Eigen and HIP implementation.
//
// In the old HIP implementation,
// ++ __half is a typedef of __fp16
// ++ the "__h*" routines take "__half" arguments
// so we need to implicitly convert "__half_raw" to "__half" to avoid having to explicitly make
// that conversiion in each call to a "__h*" routine...that is why we have "operator __half" routine
struct __half_raw {
EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
union {
unsigned short x;
__half data;
};
operator __half(void) const { return data; }
};
#endif
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
struct half_base : public __half_raw {
EIGEN_DEVICE_FUNC half_base() {}
EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
#if defined(EIGEN_HAS_HIP_FP16)
#if defined(EIGEN_HAS_OLD_HIP_FP16)
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(__half_as_ushort(h)) {}
#else
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
#endif
#endif
};
} // namespace half_impl
// Class definition.
struct half : public half_impl::half_base {
#if !defined(EIGEN_HAS_HIP_FP16) || defined(EIGEN_HAS_OLD_HIP_FP16)
typedef half_impl::__half_raw __half_raw;
#endif
EIGEN_DEVICE_FUNC half() {}
EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
#if defined(EIGEN_HAS_HIP_FP16)
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
#endif
explicit EIGEN_DEVICE_FUNC half(bool b)
: half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
template<class T>
explicit EIGEN_DEVICE_FUNC half(const T& val)
: half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
explicit EIGEN_DEVICE_FUNC half(float f)
: half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const {
// +0.0 and -0.0 become false, everything else becomes true.
return (x & 0x7fff) != 0;
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const {
return static_cast<signed char>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const {
return static_cast<unsigned char>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const {
return static_cast<short>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const {
return static_cast<unsigned short>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const {
return static_cast<int>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const {
return static_cast<unsigned int>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const {
return static_cast<long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
return static_cast<unsigned long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
return static_cast<long long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
return static_cast<unsigned long long>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
return half_impl::half_to_float(*this);
}
EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
return static_cast<double>(half_impl::half_to_float(*this));
}
EIGEN_DEVICE_FUNC half& operator=(const half& other) {
x = other.x;
return *this;
}
};
} // end namespace Eigen
namespace std {
template<>
struct numeric_limits<Eigen::half> {
static const bool is_specialized = true;
static const bool is_signed = true;
static const bool is_integer = false;
static const bool is_exact = false;
static const bool has_infinity = true;
static const bool has_quiet_NaN = true;
static const bool has_signaling_NaN = true;
static const float_denorm_style has_denorm = denorm_present;
static const bool has_denorm_loss = false;
static const std::float_round_style round_style = std::round_to_nearest;
static const bool is_iec559 = false;
static const bool is_bounded = false;
static const bool is_modulo = false;
static const int digits = 11;
static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
static const int radix = 2;
static const int min_exponent = -13;
static const int min_exponent10 = -4;
static const int max_exponent = 16;
static const int max_exponent10 = 4;
static const bool traps = true;
static const bool tinyness_before = false;
static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); }
static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); }
static Eigen::half round_error() { return Eigen::half(0.5); }
static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); }
};
// If std::numeric_limits<T> is specialized, should also specialize
// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
// std::numeric_limits<const volatile T>
// https://stackoverflow.com/a/16519653/
template<>
struct numeric_limits<const Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<volatile Eigen::half> : numeric_limits<Eigen::half> {};
template<>
struct numeric_limits<const volatile Eigen::half> : numeric_limits<Eigen::half> {};
} // end namespace std
namespace Eigen {
namespace half_impl {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
// Intrinsics for native fp16 support. Note that on current hardware,
// these are no faster than fp32 arithmetic (you need to use the half2
// versions to get the ALU speed increased), but you do save the
// conversion steps back and forth.
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
return __hadd(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
return __hmul(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
return __hsub(a, b);
}
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
float num = __half2float(a);
float denom = __half2float(b);
return __float2half(num / denom);
}
EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
return __hneg(a);
}
EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) {
a = a + b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) {
a = a * b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) {
a = a - b;
return a;
}
EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) {
a = a / b;
return a;
}
EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) {
return __heq(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) {
return __hne(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) {
return __hlt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) {
return __hle(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) {
return __hgt(a, b);
}
EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
return __hge(a, b);
}
#else // Emulate support for half floats
// Definitions for CPUs mostly working through conversion to/from fp32.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
return half(float(a) + float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) {
return half(float(a) * float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) {
return half(float(a) - float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) {
return half(float(a) / float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator - (const half& a) {
half result;
result.x = a.x ^ 0x8000;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) {
a = half(float(a) + float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) {
a = half(float(a) * float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) {
a = half(float(a) - float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) {
a = half(float(a) / float(b));
return a;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) {
return numext::equal_strict(float(a),float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) {
return numext::not_equal_strict(float(a), float(b));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) {
return float(a) < float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator <= (const half& a, const half& b) {
return float(a) <= float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) {
return float(a) > float(b);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const half& b) {
return float(a) >= float(b);
}
#endif // Emulate support for half floats
// Division by an index. Do it in full float precision to avoid accuracy
// issues in converting the denominator to half.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) {
return half(static_cast<float>(a) / static_cast<float>(b));
}
// Conversion routines, including fallbacks for the host or older CUDA.
// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
// these in hardware. If we need more performance on older/other CPUs, they are
// also possible to vectorize directly.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) {
__half_raw h;
h.x = x;
return h;
}
union FP32 {
unsigned int u;
float f;
};
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
__half tmp_ff = __float2half(ff);
#if defined(EIGEN_HAS_OLD_HIP_FP16)
__half_raw h;
h.data = tmp_ff;
return h;
#else
return *(__half_raw*)&tmp_ff;
#endif
#elif defined(EIGEN_HAS_FP16_C)
__half_raw h;
h.x = _cvtss_sh(ff, 0);
return h;
#else
FP32 f; f.f = ff;
const FP32 f32infty = { 255 << 23 };
const FP32 f16max = { (127 + 16) << 23 };
const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 };
unsigned int sign_mask = 0x80000000u;
__half_raw o;
o.x = static_cast<unsigned short>(0x0u);
unsigned int sign = f.u & sign_mask;
f.u ^= sign;
// NOTE all the integer compares in this function can be safely
// compiled into signed compares since all operands are below
// 0x80000000. Important if you want fast straight SSE2 code
// (since there's no unsigned PCMPGTD).
if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set)
o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
} else { // (De)normalized number or zero
if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero
// use a magic value to align our 10 mantissa bits at the bottom of
// the float. as long as FP addition is round-to-nearest-even this
// just works.
f.f += denorm_magic.f;
// and one integer subtract of the bias later, we have our final float!
o.x = static_cast<unsigned short>(f.u - denorm_magic.u);
} else {
unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd
// update exponent, rounding bias part 1
f.u += ((unsigned int)(15 - 127) << 23) + 0xfff;
// rounding bias part 2
f.u += mant_odd;
// take the bits!
o.x = static_cast<unsigned short>(f.u >> 13);
}
}
o.x |= static_cast<unsigned short>(sign >> 16);
return o;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return __half2float(h);
#elif defined(EIGEN_HAS_FP16_C)
return _cvtsh_ss(h.x);
#else
const FP32 magic = { 113 << 23 };
const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift
FP32 o;
o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
unsigned int exp = shifted_exp & o.u; // just the exponent
o.u += (127 - 15) << 23; // exponent adjust
// handle exponent special cases
if (exp == shifted_exp) { // Inf/NaN?
o.u += (128 - 16) << 23; // extra exp adjust
} else if (exp == 0) { // Zero/Denormal?
o.u += 1 << 23; // extra exp adjust
o.f -= magic.f; // renormalize
}
o.u |= (h.x & 0x8000) << 16; // sign bit
return o.f;
#endif
}
// --- standard functions ---
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
return (a.x & 0x7fff) == 0x7c00;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return __hisnan(a);
#else
return (a.x & 0x7fff) > 0x7c00;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isfinite)(const half& a) {
return !(isinf EIGEN_NOT_A_MACRO (a)) && !(isnan EIGEN_NOT_A_MACRO (a));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
half result;
result.x = a.x & 0x7FFF;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hexp(a));
#else
return half(::expf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
return half(numext::expm1(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hlog(a));
#else
return half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) {
return half(numext::log1p(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
return half(::log10f(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hsqrt(a));
#else
return half(::sqrtf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
return half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) {
return half(::sinf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) {
return half(::cosf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) {
return half(::tanf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
return half(::tanhf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hfloor(a));
#else
return half(::floorf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return half(hceil(a));
#else
return half(::ceilf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return __hlt(b, a) ? b : a;
#else
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f2 < f1 ? b : a;
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return __hlt(a, b) ? b : a;
#else
const float f1 = static_cast<float>(a);
const float f2 = static_cast<float>(b);
return f1 < f2 ? b : a;
#endif
}
EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
os << static_cast<float>(v);
return os;
}
} // end namespace half_impl
// import Eigen::half_impl::half into Eigen namespace
// using half_impl::half;
namespace internal {
template<>
struct random_default_impl<half, false, false>
{
static inline half run(const half& x, const half& y)
{
return x + (y-x) * half(float(std::rand()) / float(RAND_MAX));
}
static inline half run()
{
return run(half(-1.f), half(1.f));
}
};
template<> struct is_arithmetic<half> { enum { value = true }; };
} // end namespace internal
template<> struct NumTraits<Eigen::half>
: GenericNumTraits<Eigen::half>
{
enum {
IsSigned = true,
IsInteger = false,
IsComplex = false,
RequireInitialization = false
};
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() {
return half_impl::raw_uint16_to_half(0x0800);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { return Eigen::half(1e-2f); }
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() {
return half_impl::raw_uint16_to_half(0x7bff);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() {
return half_impl::raw_uint16_to_half(0xfbff);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() {
return half_impl::raw_uint16_to_half(0x7c00);
}
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
return half_impl::raw_uint16_to_half(0x7c01);
}
};
} // end namespace Eigen
// C-like standard mathematical functions and trancendentals.
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) {
Eigen::half result;
result.x = a.x & 0x7FFF;
return result;
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
return Eigen::half(::expf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return Eigen::half(hlog(a));
#else
return Eigen::half(::logf(float(a)));
#endif
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) {
return Eigen::half(::sqrtf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, const Eigen::half& b) {
return Eigen::half(::powf(float(a), float(b)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) {
return Eigen::half(::floorf(float(a)));
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) {
return Eigen::half(::ceilf(float(a)));
}
namespace std {
#if __cplusplus > 199711L
template <>
struct hash<Eigen::half> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
return static_cast<std::size_t>(a.x);
}
};
#endif
} // end namespace std
// Add the missing shfl_xor intrinsic
#if defined(EIGEN_HAS_HIP_FP16) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
// FIXME
//return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
return var;
}
#endif
// ldg() has an overload for __half, but we also need one for Eigen::half.
#if defined(EIGEN_HAS_HIP_FP16) && \
defined(__HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__) && defined(__HIP_ARCH_HAS_DYNAMIC_PARALLEL__)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
// FIXME
//return Eigen::half_impl::raw_uint16_to_half(
// __ldg(reinterpret_cast<const unsigned short*>(ptr)));
return *ptr;
}
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
namespace Eigen {
namespace numext {
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool (isnan)(const Eigen::half& h) {
return (half_impl::isnan)(h);
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool (isinf)(const Eigen::half& h) {
return (half_impl::isinf)(h);
}
template<>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool (isfinite)(const Eigen::half& h) {
return (half_impl::isfinite)(h);
}
} // namespace Eigen
} // namespace numext
#endif
#endif // EIGEN_HALF_HIP_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,212 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_TYPE_CASTING_HIP_H
#define EIGEN_TYPE_CASTING_HIP_H
namespace Eigen {
namespace internal {
template<>
struct scalar_cast_op<float, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return __float2half(a);
#else
return Eigen::half(a);
#endif
}
};
template<>
struct functor_traits<scalar_cast_op<float, Eigen::half> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
template<>
struct scalar_cast_op<int, Eigen::half> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef Eigen::half result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return __float2half(static_cast<float>(a));
#else
return Eigen::half(static_cast<float>(a));
#endif
}
};
template<>
struct functor_traits<scalar_cast_op<int, Eigen::half> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
template<>
struct scalar_cast_op<Eigen::half, float> {
EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
typedef float result_type;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
return __half2float(a);
#else
return static_cast<float>(a);
#endif
}
};
template<>
struct functor_traits<scalar_cast_op<Eigen::half, float> >
{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
#if defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)
template <>
struct type_casting_traits<Eigen::half, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 2,
TgtCoeffRatio = 1
};
};
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
float2 r1 = __half22float2(a);
float2 r2 = __half22float2(b);
return make_float4(r1.x, r1.y, r2.x, r2.y);
}
template <>
struct type_casting_traits<float, Eigen::half> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 2
};
};
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
// Simply discard the second half of the input
return __floats2half2_rn(a.x, a.y);
}
#elif defined EIGEN_VECTORIZE_AVX512
template <>
struct type_casting_traits<half, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
return half2float(a);
}
template <>
struct type_casting_traits<float, half> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
return float2half(a);
}
#elif defined EIGEN_VECTORIZE_AVX
template <>
struct type_casting_traits<Eigen::half, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
return half2float(a);
}
template <>
struct type_casting_traits<float, Eigen::half> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
return float2half(a);
}
// Disable the following code since it's broken on too many platforms / compilers.
//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
#elif 0
template <>
struct type_casting_traits<Eigen::half, float> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
__int64_t a64 = _mm_cvtm64_si64(a.x);
Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
float f1 = static_cast<float>(h);
h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
float f2 = static_cast<float>(h);
h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
float f3 = static_cast<float>(h);
h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
float f4 = static_cast<float>(h);
return _mm_set_ps(f4, f3, f2, f1);
}
template <>
struct type_casting_traits<float, Eigen::half> {
enum {
VectorizedCast = 1,
SrcCoeffRatio = 1,
TgtCoeffRatio = 1
};
};
template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
EIGEN_ALIGN16 float aux[4];
pstore(aux, a);
Eigen::half h0(aux[0]);
Eigen::half h1(aux[1]);
Eigen::half h2(aux[2]);
Eigen::half h3(aux[3]);
Packet4h result;
result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
return result;
}
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_TYPE_CASTING_HIP_H

View File

@ -0,0 +1,23 @@
/*
* math_constants.h -
* HIP equivalent of the CUDA header of the same name
*/
#ifndef __MATH_CONSTANTS_H__
#define __MATH_CONSTANTS_H__
/* single precision constants */
#define HIPRT_INF_F __int_as_float(0x7f800000)
#define HIPRT_NAN_F __int_as_float(0x7fffffff)
#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
#define HIPRT_ZERO_F 0.0f
#define HIPRT_ONE_F 1.0f
/* double precision constants */
#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
#endif

View File

@ -436,6 +436,9 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC explicit
#endif
bind1st_op(const first_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
@ -455,6 +458,9 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
typedef typename BinaryOp::second_argument_type second_argument_type;
typedef typename BinaryOp::result_type result_type;
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC explicit
#endif
bind2nd_op(const second_argument_type &val) : m_value(val) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }

View File

@ -163,7 +163,10 @@ class BlasLinearMapper {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
EIGEN_ALWAYS_INLINE void prefetch(int i) const {
internal::prefetch(&operator()(i));
}

View File

@ -1008,9 +1008,12 @@ namespace Eigen {
# define EIGEN_TRY try
# define EIGEN_CATCH(X) catch (X)
#else
# ifdef EIGEN_CUDA_ARCH
# if defined(EIGEN_CUDA_ARCH)
# define EIGEN_THROW_X(X) asm("trap;")
# define EIGEN_THROW asm("trap;")
# elif defined(EIGEN_HIP_DEVICE_COMPILE)
# define EIGEN_THROW_X(X) asm("s_trap 0")
# define EIGEN_THROW asm("s_trap 0")
# else
# define EIGEN_THROW_X(X) std::abort()
# define EIGEN_THROW std::abort()

View File

@ -70,7 +70,20 @@ inline void throw_std_bad_alloc()
throw std::bad_alloc();
#else
std::size_t huge = static_cast<std::size_t>(-1);
#if defined(EIGEN_HIPCC)
//
// calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
// and as a consequence the code in the #else block triggers the hipcc warning :
// "no overloaded function has restriction specifiers that are compatible with the ambient context"
//
// "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
// the same on "operator new"
// Reverting code back to the old version in this #if block for the hipcc compiler
//
new int[huge];
#else
::operator new(huge);
#endif
#endif
}
@ -156,7 +169,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
void *result;
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
#if defined(EIGEN_HIP_DEVICE_COMPILE)
result = aligned_malloc(size);
#else
result = std::malloc(size);
#endif
#if EIGEN_DEFAULT_ALIGN_BYTES==16
eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
#endif
@ -174,7 +193,13 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
{
#if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
#if defined(EIGEN_HIP_DEVICE_COMPILE)
aligned_free(ptr);
#else
std::free(ptr);
#endif
#else
handmade_aligned_free(ptr);
#endif
@ -218,7 +243,12 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
{
check_that_malloc_is_allowed();
#if defined(EIGEN_HIP_DEVICE_COMPILE)
void *result = aligned_malloc(size);
#else
void *result = std::malloc(size);
#endif
if(!result && size)
throw_std_bad_alloc();
return result;
@ -232,7 +262,11 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
{
#if defined(EIGEN_HIP_DEVICE_COMPILE)
aligned_free(ptr);
#else
std::free(ptr);
#endif
}
template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@ -493,7 +527,11 @@ template<typename T> struct smart_copy_helper<T,true> {
IntPtr size = IntPtr(end)-IntPtr(start);
if(size==0) return;
eigen_internal_assert(start!=0 && end!=0 && target!=0);
#if defined(EIGEN_HIP_DEVICE_COMPILE)
::memcpy(target, start, size);
#else
std::memcpy(target, start, size);
#endif
}
};

View File

@ -16,6 +16,12 @@
#include <math_constants.h>
#endif
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#include <cfloat>
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
#endif
#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L
#include <cstdint>
#endif
@ -175,7 +181,7 @@ template<bool Condition, typename T=void> struct enable_if;
template<typename T> struct enable_if<true,T>
{ typedef T type; };
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
#if !defined(__FLT_EPSILON__)
#define __FLT_EPSILON__ FLT_EPSILON
#define __DBL_EPSILON__ DBL_EPSILON
@ -197,13 +203,31 @@ template<> struct numeric_limits<float>
EIGEN_DEVICE_FUNC
static float epsilon() { return __FLT_EPSILON__; }
EIGEN_DEVICE_FUNC
static float (max)() { return CUDART_MAX_NORMAL_F; }
static float (max)() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_MAX_NORMAL_F;
#else
return HIPRT_MAX_NORMAL_F;
#endif
}
EIGEN_DEVICE_FUNC
static float (min)() { return FLT_MIN; }
EIGEN_DEVICE_FUNC
static float infinity() { return CUDART_INF_F; }
static float infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF_F;
#else
return HIPRT_INF_F;
#endif
}
EIGEN_DEVICE_FUNC
static float quiet_NaN() { return CUDART_NAN_F; }
static float quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN_F;
#else
return HIPRT_NAN_F;
#endif
}
};
template<> struct numeric_limits<double>
{
@ -214,9 +238,21 @@ template<> struct numeric_limits<double>
EIGEN_DEVICE_FUNC
static double (min)() { return DBL_MIN; }
EIGEN_DEVICE_FUNC
static double infinity() { return CUDART_INF; }
static double infinity() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_INF;
#else
return HIPRT_INF;
#endif
}
EIGEN_DEVICE_FUNC
static double quiet_NaN() { return CUDART_NAN; }
static double quiet_NaN() {
#if defined(EIGEN_CUDA_ARCH)
return CUDART_NAN;
#else
return HIPRT_NAN;
#endif
}
};
template<> struct numeric_limits<int>
{
@ -529,13 +565,13 @@ template<typename T, typename U> struct scalar_product_traits
namespace numext {
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
template<typename T> EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
#else
template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
#endif
#if defined(EIGEN_CUDA_ARCH)
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
using internal::device::numeric_limits;
#else
using std::numeric_limits;

View File

@ -354,6 +354,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
static const int m_maxIterations = 30;
protected:
EIGEN_DEVICE_FUNC
static void check_template_parameters()
{
EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);

View File

@ -1299,7 +1299,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
#endif
}//end deflation
#ifndef EIGEN_CUDACC
#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
/** \svd_module
*
* \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm

View File

@ -19,7 +19,9 @@ macro(ei_add_test_internal testname testname_with_suffix)
endif()
if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu)
if(EIGEN_TEST_CUDA_CLANG)
if(EIGEN_TEST_HIP)
hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP")
elseif(EIGEN_TEST_CUDA_CLANG)
set_source_files_properties(${filename} PROPERTIES LANGUAGE CXX)
if(CUDA_64_BIT_DEVICE_CODE)
link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
@ -491,6 +493,11 @@ macro(ei_testing_print_summary)
else()
message(STATUS "CUDA: OFF")
endif()
if(EIGEN_TEST_HIP)
message(STATUS "HIP: ON (using hipcc)")
else()
message(STATUS "HIP: OFF")
endif()
endif() # vectorization / alignment options

View File

@ -407,6 +407,48 @@ endif(CUDA_FOUND)
endif(EIGEN_TEST_CUDA)
# HIP unit tests
option(EIGEN_TEST_HIP "Add HIP support." OFF)
if (EIGEN_TEST_HIP)
set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
if (EXISTS ${HIP_PATH})
list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
find_package(HIP REQUIRED)
if (HIP_FOUND)
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
if (${HIP_PLATFORM} STREQUAL "hcc")
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${HIP_PATH}/include)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(hip_basic)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
elseif (${HIP_PLATFORM} STREQUAL "nvcc")
message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
else ()
message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
endif()
endif(HIP_FOUND)
else ()
message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
endif()
endif(EIGEN_TEST_HIP)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests)
add_test(NAME failtests WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/failtests COMMAND ${CMAKE_COMMAND} ${Eigen_SOURCE_DIR} -G "${CMAKE_GENERATOR}" -DEIGEN_FAILTEST=ON)

172
test/hip_basic.cu Normal file
View File

@ -0,0 +1,172 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
#undef _GLIBCXX_ATOMIC_BUILTINS
#undef _GLIBCXX_USE_INT128
#endif
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC hip_basic
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include <hip/hip_runtime.h>
#include "main.h"
#include "hip_common.h"
// Check that dense modules can be properly parsed by hipcc
#include <Eigen/Dense>
// struct Foo{
// EIGEN_DEVICE_FUNC
// void operator()(int i, const float* mats, float* vecs) const {
// using namespace Eigen;
// // Matrix3f M(data);
// // Vector3f x(data+9);
// // Map<Vector3f>(data+9) = M.inverse() * x;
// Matrix3f M(mats+i/16);
// Vector3f x(vecs+i*3);
// // using std::min;
// // using std::sqrt;
// Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x();
// //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
// }
// };
template<typename T>
struct coeff_wise {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
{
using namespace Eigen;
T x1(in+i);
T x2(in+i+1);
T x3(in+i+2);
Map<T> res(out+i*T::MaxSizeAtCompileTime);
res.array() += (in[0] * x1 + x2).array() * x3.array();
}
};
template<typename T>
struct replicate {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
{
using namespace Eigen;
T x1(in+i);
int step = x1.size() * 4;
int stride = 3 * step;
typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
}
};
template<typename T>
struct redux {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
{
using namespace Eigen;
int N = 10;
T x1(in+i);
out[i*N+0] = x1.minCoeff();
out[i*N+1] = x1.maxCoeff();
out[i*N+2] = x1.sum();
out[i*N+3] = x1.prod();
out[i*N+4] = x1.matrix().squaredNorm();
out[i*N+5] = x1.matrix().norm();
out[i*N+6] = x1.colwise().sum().maxCoeff();
out[i*N+7] = x1.rowwise().maxCoeff().sum();
out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
}
};
template<typename T1, typename T2>
struct prod_test {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
{
using namespace Eigen;
typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
T1 x1(in+i);
T2 x2(in+i+1);
Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
res += in[i] * x1 * x2;
}
};
template<typename T1, typename T2>
struct diagonal {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
{
using namespace Eigen;
T1 x1(in+i);
Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
res += x1.diagonal();
}
};
template<typename T>
struct eigenvalues {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
{
using namespace Eigen;
typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
T M(in+i);
Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
T A = M*M.adjoint();
SelfAdjointEigenSolver<T> eig;
eig.computeDirect(M);
res = eig.eigenvalues();
}
};
void test_hip_basic()
{
ei_test_init_hip();
int nthreads = 100;
Eigen::VectorXf in, out;
#ifndef __HIP_DEVICE_COMPILE__
int data_size = nthreads * 512;
in.setRandom(data_size);
out.setRandom(data_size);
#endif
CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Vector3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_hip(coeff_wise<Array44f>(), nthreads, in, out) );
// FIXME compile fails when we uncomment the followig two tests
// CALL_SUBTEST( run_and_compare_to_hip(replicate<Array4f>(), nthreads, in, out) );
// CALL_SUBTEST( run_and_compare_to_hip(replicate<Array33f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_hip(redux<Array4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_hip(redux<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_hip(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_hip(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );
// FIXME : Runtime failure occurs when we uncomment the following two tests
// CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix3f>(), nthreads, in, out) );
// CALL_SUBTEST( run_and_compare_to_hip(eigenvalues<Matrix2f>(), nthreads, in, out) );
}

103
test/hip_common.h Normal file
View File

@ -0,0 +1,103 @@
#ifndef EIGEN_TEST_HIP_COMMON_H
#define EIGEN_TEST_HIP_COMMON_H
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#include <iostream>
#ifndef __HIPCC__
dim3 threadIdx, blockDim, blockIdx;
#endif
template<typename Kernel, typename Input, typename Output>
void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
{
for(int i=0; i<n; i++)
ker(i, in.data(), out.data());
}
template<typename Kernel, typename Input, typename Output>
__global__ __attribute__((used))
void run_on_hip_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
{
int i = hipThreadIdx_x + hipBlockIdx_x*hipBlockDim_x;
if(i<n) {
ker(i, in, out);
}
}
template<typename Kernel, typename Input, typename Output>
void run_on_hip(const Kernel& ker, int n, const Input& in, Output& out)
{
typename Input::Scalar* d_in;
typename Output::Scalar* d_out;
std::ptrdiff_t in_bytes = in.size() * sizeof(typename Input::Scalar);
std::ptrdiff_t out_bytes = out.size() * sizeof(typename Output::Scalar);
hipMalloc((void**)(&d_in), in_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_out, out.data(), out_bytes, hipMemcpyHostToDevice);
// Simple and non-optimal 1D mapping assuming n is not too large
// That's only for unit testing!
dim3 Blocks(128);
dim3 Grids( (n+int(Blocks.x)-1)/int(Blocks.x) );
hipDeviceSynchronize();
hipLaunchKernelGGL(HIP_KERNEL_NAME(run_on_hip_meta_kernel<Kernel,
typename std::decay<decltype(*d_in)>::type,
typename std::decay<decltype(*d_out)>::type>),
dim3(Grids), dim3(Blocks), 0, 0, ker, n, d_in, d_out);
hipDeviceSynchronize();
// check inputs have not been modified
hipMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, hipMemcpyDeviceToHost);
hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost);
hipFree(d_in);
hipFree(d_out);
}
template<typename Kernel, typename Input, typename Output>
void run_and_compare_to_hip(const Kernel& ker, int n, const Input& in, Output& out)
{
Input in_ref, in_hip;
Output out_ref, out_hip;
#ifndef __HIP_DEVICE_COMPILE__
in_ref = in_hip = in;
out_ref = out_hip = out;
#endif
run_on_cpu (ker, n, in_ref, out_ref);
run_on_hip(ker, n, in_hip, out_hip);
#ifndef __HIP_DEVICE_COMPILE__
VERIFY_IS_APPROX(in_ref, in_hip);
VERIFY_IS_APPROX(out_ref, out_hip);
#endif
}
void ei_test_init_hip()
{
int device = 0;
hipDeviceProp_t deviceProp;
hipGetDeviceProperties(&deviceProp, device);
std::cout << "HIP device info:\n";
std::cout << " name: " << deviceProp.name << "\n";
std::cout << " capability: " << deviceProp.major << "." << deviceProp.minor << "\n";
std::cout << " multiProcessorCount: " << deviceProp.multiProcessorCount << "\n";
std::cout << " maxThreadsPerMultiProcessor: " << deviceProp.maxThreadsPerMultiProcessor << "\n";
std::cout << " warpSize: " << deviceProp.warpSize << "\n";
std::cout << " regsPerBlock: " << deviceProp.regsPerBlock << "\n";
std::cout << " concurrentKernels: " << deviceProp.concurrentKernels << "\n";
std::cout << " clockRate: " << deviceProp.clockRate << "\n";
std::cout << " canMapHostMemory: " << deviceProp.canMapHostMemory << "\n";
std::cout << " computeMode: " << deviceProp.computeMode << "\n";
}
#endif // EIGEN_TEST_HIP_COMMON_H

View File

@ -67,11 +67,17 @@
// protected by parenthesis against macro expansion, the min()/max() macros
// are defined here and any not-parenthesized min/max call will cause a
// compiler error.
#define min(A,B) please_protect_your_min_with_parentheses
#define max(A,B) please_protect_your_max_with_parentheses
#define isnan(X) please_protect_your_isnan_with_parentheses
#define isinf(X) please_protect_your_isinf_with_parentheses
#define isfinite(X) please_protect_your_isfinite_with_parentheses
#if !defined(__HIPCC__)
// HIP headers include the <thread> header which contains not-parenthesized
// calls to "max", triggering the following check and causing the compile to fail
// so disabling the following checks for HIP
#define min(A,B) please_protect_your_min_with_parentheses
#define max(A,B) please_protect_your_max_with_parentheses
#define isnan(X) please_protect_your_isnan_with_parentheses
#define isinf(X) please_protect_your_isinf_with_parentheses
#define isfinite(X) please_protect_your_isfinite_with_parentheses
#endif
#ifdef M_PI
#undef M_PI
#endif
@ -154,7 +160,7 @@ namespace Eigen
#define EIGEN_DEFAULT_IO_FORMAT IOFormat(4, 0, " ", "\n", "", "", "", "")
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__)
#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
#define EIGEN_EXCEPTIONS
#endif
@ -233,7 +239,7 @@ namespace Eigen
}
#endif //EIGEN_EXCEPTIONS
#elif !defined(__CUDACC__) // EIGEN_DEBUG_ASSERTS
#elif !defined(__CUDACC__) && !defined(__HIPCC__)// EIGEN_DEBUG_ASSERTS
// see bug 89. The copy_bool here is working around a bug in gcc <= 4.3
#define eigen_assert(a) \
if( (!Eigen::internal::copy_bool(a)) && (!no_more_assert) )\
@ -290,7 +296,7 @@ namespace Eigen
std::cout << "Can't VERIFY_RAISES_STATIC_ASSERT( " #a " ) with exceptions disabled\n";
#endif
#if !defined(__CUDACC__)
#if !defined(__CUDACC__) && !defined(__HIPCC__)
#define EIGEN_USE_CUSTOM_ASSERT
#endif

View File

@ -80,12 +80,16 @@ typedef unsigned __int64 uint64_t;
#endif
#ifdef EIGEN_USE_GPU
#include <iostream>
#include <cuda_runtime.h>
#if __cplusplus >= 201103L
#include <atomic>
#include <unistd.h>
#endif
#include <iostream>
#if defined(EIGEN_USE_HIP)
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#if __cplusplus >= 201103L
#include <atomic>
#include <unistd.h>
#endif
#endif
#include "src/Tensor/TensorMacros.h"
@ -95,7 +99,11 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
#include "src/Tensor/TensorDeviceCuda.h"
#if defined(EIGEN_USE_HIP)
#include "src/Tensor/TensorDeviceHip.h"
#else
#include "src/Tensor/TensorDeviceCuda.h"
#endif
#include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h"
@ -112,16 +120,28 @@ typedef unsigned __int64 uint64_t;
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionCuda.h"
#if defined(EIGEN_USE_HIP)
#include "src/Tensor/TensorReductionHip.h"
#else
#include "src/Tensor/TensorReductionCuda.h"
#endif
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
#include "src/Tensor/TensorContractionCuda.h"
#if defined(EIGEN_USE_HIP)
#include "src/Tensor/TensorContractionHip.h"
#else
#include "src/Tensor/TensorContractionCuda.h"
#endif
#include "src/Tensor/TensorConversion.h"
#include "src/Tensor/TensorConvolution.h"
#if defined(EIGEN_USE_HIP)
#include "src/Tensor/TensorConvolutionHip.h"
#else
#include "src/Tensor/TensorConvolution.h"
#endif
#include "src/Tensor/TensorFFT.h"
#include "src/Tensor/TensorPatch.h"
#include "src/Tensor/TensorImagePatch.h"

View File

@ -448,7 +448,10 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
void evalGemv(Scalar* buffer) const {
const Index rows = m_i_size;
const Index cols = m_k_size;
@ -489,7 +492,10 @@ struct TensorContractionEvaluatorBase
}
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
void evalGemm(Scalar* buffer) const {
#if defined(EIGEN_VECTORIZE_AVX) && defined(EIGEN_USE_LIBXSMM)
if (m_can_use_xsmm) {
evalGemmXSMM(buffer);

View File

@ -28,7 +28,10 @@ class TensorContractionBlocking {
typedef typename LhsMapper::Scalar LhsScalar;
typedef typename RhsMapper::Scalar RhsScalar;
EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -35,9 +35,12 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#ifndef EIGEN_CUDA_ARCH
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on the host CPU
return 1;
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
return 64;
#else
// Running on a CUDA device
return 32;
@ -45,7 +48,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on the host CPU
return l1CacheSize();
#else
@ -55,7 +58,7 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__)
#if !defined(EIGEN_CUDA_ARCH) && !defined(__SYCL_DEVICE_ONLY__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running single threaded on the host CPU
return l3CacheSize();
#else
@ -65,10 +68,14 @@ struct DefaultDevice {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#ifndef EIGEN_CUDA_ARCH
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
// return 1 as major for HIP
return 1;
#else
// Running on a CUDA device
return EIGEN_CUDA_ARCH / 100;

View File

@ -0,0 +1,352 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H
#if defined(EIGEN_HIPCC)
#include "hip/hip_runtime.h"
#include "hip/hip_runtime_api.h"
#endif
#include <unistd.h> //for sleep function
namespace Eigen {
static const int kHipScratchSize = 1024;
// This defines an interface that GPUDevice can take to use
// HIP streams underneath.
class StreamInterface {
public:
virtual ~StreamInterface() {}
virtual const hipStream_t& stream() const = 0;
virtual const hipDeviceProp_t& deviceProperties() const = 0;
// Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
// Return a scratchpad buffer of size 1k
virtual void* scratchpad() const = 0;
// Return a semaphore. The semaphore is initially initialized to 0, and
// each kernel using it is responsible for resetting to 0 upon completion
// to maintain the invariant that the semaphore is always equal to 0 upon
// each kernel start.
virtual unsigned int* semaphore() const = 0;
};
static hipDeviceProp_t* m_deviceProperties;
static bool m_devicePropInitialized = false;
static void initializeDeviceProp() {
if (!m_devicePropInitialized) {
// Attempts to ensure proper behavior in the case of multiple threads
// calling this function simultaneously. This would be trivial to
// implement if we could use std::mutex, but unfortunately mutex don't
// compile with nvcc, so we resort to atomics and thread fences instead.
// Note that if the caller uses a compiler that doesn't support c++11 we
// can't ensure that the initialization is thread safe.
#if 0 && __cplusplus >= 201103L
static std::atomic<bool> first(true);
if (first.exchange(false)) {
#else
static bool first = true;
if (first) {
first = false;
#endif
// We're the first thread to reach this point.
int num_devices;
hipError_t status = hipGetDeviceCount(&num_devices);
if (status != hipSuccess) {
std::cerr << "Failed to get the number of HIP devices: "
<< hipGetErrorString(status)
<< std::endl;
assert(status == hipSuccess);
}
m_deviceProperties = new hipDeviceProp_t[num_devices];
for (int i = 0; i < num_devices; ++i) {
status = hipGetDeviceProperties(&m_deviceProperties[i], i);
if (status != hipSuccess) {
std::cerr << "Failed to initialize HIP device #"
<< i
<< ": "
<< hipGetErrorString(status)
<< std::endl;
assert(status == hipSuccess);
}
}
#if 0 && __cplusplus >= 201103L
std::atomic_thread_fence(std::memory_order_release);
#endif
m_devicePropInitialized = true;
} else {
// Wait for the other thread to inititialize the properties.
while (!m_devicePropInitialized) {
#if 0 && __cplusplus >= 201103L
std::atomic_thread_fence(std::memory_order_acquire);
#endif
sleep(1);
}
}
}
}
static const hipStream_t default_stream = 0x00;//TODO: Use hipStreamDefault instead of 0x00;
class HipStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
HipStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
hipGetDevice(&device_);
initializeDeviceProp();
}
// Use the default stream on the specified device
HipStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
initializeDeviceProp();
}
// Use the specified stream. Note that it's the
// caller responsibility to ensure that the stream can run on
// the specified device. If no device is specified the code
// assumes that the stream is associated to the current gpu device.
HipStreamDevice(const hipStream_t* stream, int device = -1)
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) {
hipGetDevice(&device_);
} else {
int num_devices;
hipError_t err = hipGetDeviceCount(&num_devices);
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
assert(device < num_devices);
device_ = device;
}
initializeDeviceProp();
}
virtual ~HipStreamDevice() {
if (scratch_) {
deallocate(scratch_);
}
}
const hipStream_t& stream() const { return *stream_; }
const hipDeviceProp_t& deviceProperties() const {
return m_deviceProperties[device_];
}
virtual void* allocate(size_t num_bytes) const {
hipError_t err = hipSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
void* result;
err = hipMalloc(&result, num_bytes);
assert(err == hipSuccess);
assert(result != NULL);
return result;
}
virtual void deallocate(void* buffer) const {
hipError_t err = hipSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
assert(buffer != NULL);
err = hipFree(buffer);
assert(err == hipSuccess);
}
virtual void* scratchpad() const {
if (scratch_ == NULL) {
scratch_ = allocate(kHipScratchSize + sizeof(unsigned int));
}
return scratch_;
}
virtual unsigned int* semaphore() const {
if (semaphore_ == NULL) {
char* scratch = static_cast<char*>(scratchpad()) + kHipScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
//hipError_t err = hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
hipError_t err = hipMemset(semaphore_, 0, sizeof(unsigned int));
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
}
return semaphore_;
}
private:
const hipStream_t* stream_;
int device_;
mutable void* scratch_;
mutable unsigned int* semaphore_;
};
struct GpuDevice {
// The StreamInterface is not owned: the caller is
// responsible for its initialization and eventual destruction.
explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
eigen_assert(stream);
}
explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
eigen_assert(stream);
}
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const hipStream_t& stream() const {
return stream_->stream();
}
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return stream_->allocate(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
stream_->deallocate(buffer);
}
EIGEN_STRONG_INLINE void* scratchpad() const {
return stream_->scratchpad();
}
EIGEN_STRONG_INLINE unsigned int* semaphore() const {
return stream_->semaphore();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#if !defined(EIGEN_HIP_DEVICE_COMPILE)
hipError_t err = hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToDevice,
stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
hipError_t err =
hipMemcpyAsync(dst, src, n, hipMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
}
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
hipError_t err =
hipMemcpyAsync(dst, src, n, hipMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#if !defined(EIGEN_HIP_DEVICE_COMPILE)
//TODO:hipError_t err = hipMemsetAsync(buffer, c, n, stream_->stream());
hipError_t err = hipMemset(buffer, c, n);
EIGEN_UNUSED_VARIABLE(err)
assert(err == hipSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on hip devices.
return firstLevelCacheSize();
}
// FIXME - this will move into HIP
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#undef assert
#define assert(COND)
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#if defined(EIGEN_HIPCC) && \
!defined(EIGEN_HIP_DEVICE_COMPILE)
hipError_t err = hipStreamSynchronize(stream_->stream());
if (err != hipSuccess) {
std::cerr << "Error detected in HIP stream: "
<< hipGetErrorString(err)
<< std::endl;
assert(err == hipSuccess);
}
#else
assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_STRONG_INLINE int getNumHipMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount;
}
EIGEN_STRONG_INLINE int maxHipThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock;
}
EIGEN_STRONG_INLINE int maxHipThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
}
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
}
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
}
EIGEN_STRONG_INLINE int minorDeviceVersion() const {
return stream_->deviceProperties().minor;
}
EIGEN_STRONG_INLINE int maxBlocks() const {
return max_blocks_;
}
// This function checks if the HIP runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
#if defined(EIGEN_HIPCC)
hipError_t error = hipStreamQuery(stream_->stream());
return (error == hipSuccess) || (error == hipErrorNotReady);
#else
return false;
#endif
}
private:
const StreamInterface* stream_;
int max_blocks_;
};
#define LAUNCH_HIP_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), (__VA_ARGS__)); \
assert(hipGetLastError() == hipSuccess);
// FIXME: Should be device and kernel specific.
#if defined(EIGEN_HIPCC)
static EIGEN_DEVICE_FUNC inline void setHipSharedMemConfig(hipSharedMemConfig config) {
#if !defined(EIGEN_HIP_DEVICE_COMPILE)
hipError_t status = hipDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
assert(status == hipSuccess);
#else
EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_HIP_H

View File

@ -201,7 +201,7 @@ class TensorExecutor<Expression, GpuDevice, Vectorizable> {
};
#if defined(EIGEN_CUDACC)
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
static __device__ EIGEN_ALWAYS_INLINE
@ -250,6 +250,17 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
#if defined(EIGEN_HIPCC)
const int block_size = device.maxHipThreadsPerBlock();
const int max_blocks = device.getNumHipMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / block_size;
const Index size = array_prod(evaluator.dimensions());
// Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
hipLaunchKernelGGL(HIP_KERNEL_NAME(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
dim3(num_blocks), dim3(block_size), 0, device.stream(), evaluator, size);
#else
const int block_size = device.maxCudaThreadsPerBlock();
const int max_blocks = device.getNumCudaMultiProcessors() *
device.maxCudaThreadsPerMultiProcessor() / block_size;
@ -260,11 +271,12 @@ inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
LAUNCH_CUDA_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
num_blocks, block_size, 0, device, evaluator, size);
#endif
}
evaluator.cleanup();
}
#endif // EIGEN_CUDACC
#endif // EIGEN_CUDACC || EIGEN_HIPCC
#endif // EIGEN_USE_GPU
// SYCL Executor policy

View File

@ -109,7 +109,10 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
// Should initialize the memory in case we're dealing with non POD types.

View File

@ -350,7 +350,11 @@ struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
namespace internal {
template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
template<typename FirstType, typename... OtherTypes>
#if defined(EIGEN_HIPCC)
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
#endif
size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
size_t result = 1;
for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i];

View File

@ -27,7 +27,7 @@
*/
// SFINAE requires variadic templates
#ifndef EIGEN_CUDACC
#if !defined(EIGEN_CUDACC) && !defined(EIGEN_HIPCC)
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC

View File

@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits<Scalar> {
};
// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)
#if defined(EIGEN_USE_GPU) && ((defined(EIGEN_CUDACC) && defined(EIGEN_HAS_CUDA_FP16)) || (defined(EIGEN_HIPCC) && defined(EIGEN_HAS_HIP_FP16)))
template <>
struct PacketType<half, GpuDevice> {
typedef half2 type;

View File

@ -858,7 +858,10 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
}
return inputIndex;
}
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
#ifndef __SYCL_DEVICE_ONLY__
return numext::maxi(min, numext::mini(max,value));

View File

@ -16,7 +16,7 @@ namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#ifdef EIGEN_CUDA_ARCH
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
assert(threadIdx.z == 0);

View File

@ -334,12 +334,12 @@ struct OuterReducer {
};
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
template <int B, int N, typename S, typename R, typename I>
__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
template <typename S, typename R, typename I>
__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I>
@ -495,7 +495,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
EIGEN_STRONG_INLINE
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
m_impl.evalSubExprsIfNeeded(NULL);
// Use the FullReducer if possible.
@ -694,9 +698,9 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
#ifdef EIGEN_HAS_CUDA_FP16
#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
template <typename S, typename R, typename I> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
template <int B, int N, typename S, typename R, typename I> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
template <int NPT, typename S, typename R, typename I> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
@ -774,14 +778,22 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedDims;
#if defined(EIGEN_HIPCC)
public:
#endif
// Evaluator for the input expression.
TensorEvaluator<ArgType, Device> m_impl;
#if defined(EIGEN_HIPCC)
private:
#endif
// Operation to apply for computing the reduction.
Op m_reducer;
// For full reductions
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)

View File

@ -0,0 +1,815 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H
#if defined(EIGEN_HIP_DEVICE_COMPILE)
#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
#endif
#if defined(EIGEN_HIPCC)
#define HIP_WARP_SIZE 64
#endif
namespace Eigen {
namespace internal {
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HIPCC)
// Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple hip thread to safely accumulate at the same
// output address. It basically reads the current value of the output variable, and
// attempts to update it with the new value. If in the meantime another hip thread
// updated the content of the output address it will try again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
if (sizeof(T) == 4)
{
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
unsigned int readback;
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
}
}
else if (sizeof(T) == 8) {
unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
unsigned long long newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
unsigned long long readback;
while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
}
}
else {
assert(0 && "Wordsize not supported");
}
#else
assert(0 && "Shouldn't be called on unsupported device");
#endif
}
// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
return atomicExch(address, val);
}
template <>
__device__ inline double atomicExchCustom(double* address, double val) {
unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
#if defined(EIGEN_HAS_HIP_FP16)
template <template <typename T> class R>
__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
if (newval == oldval) {
return;
}
unsigned int readback;
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
if (newval == oldval) {
return;
}
}
}
#endif
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
atomicAdd(output, accum);
#else
assert(0 && "Shouldn't be called on unsupported device");
#endif
}
template <typename CoeffType, typename Index>
__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
const Index num_threads = hipBlockDim_x * hipGridDim_x;
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = val;
}
}
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernel(const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore, Reducer reducer) {
#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
// Initialize the output value
const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + hipThreadIdx_x;
if (hipGridDim_x == 1) {
if (first_index == 0) {
*output = reducer.initialize();
}
}
else {
if (hipThreadIdx_x == 0) {
unsigned int block = atomicCAS(semaphore, 0u, 1u);
if (block == 0) {
// We're the first block to run, initialize the output value
atomicExchCustom(output, reducer.initialize());
__threadfence();
atomicExch(semaphore, 2u);
}
else {
// Wait for the first block to initialize the output value.
// Use atomicCAS here to ensure that the reads aren't cached
unsigned int val;
do {
val = atomicCAS(semaphore, 2u, 2u);
}
while (val < 2u);
}
}
}
__syncthreads();
eigen_assert(hipGridDim_x == 1 || *semaphore >= 2u);
typename Self::CoeffReturnType accum = reducer.initialize();
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
for (Index i = 0; i < max_iter; i+=BlockSize) {
const Index index = first_index + i;
eigen_assert(index < num_coeffs);
typename Self::CoeffReturnType val = input.m_impl.coeff(index);
reducer.reduce(val, &accum);
}
#pragma unroll
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
// XXX use std::is_floating_point to determine the type of accum
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, HIP_WARP_SIZE), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, HIP_WARP_SIZE), &accum);
}
}
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
atomicReduce(output, accum, reducer);
}
if (hipGridDim_x > 1 && hipThreadIdx_x == 0) {
// Let the last block reset the semaphore
atomicInc(semaphore, hipGridDim_x + 1);
__threadfence_system();
}
#else
assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#if defined(EIGEN_HAS_HIP_FP16)
template <typename Self,
typename Reducer, typename Index>
__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
eigen_assert(hipBlockDim_x == 1);
eigen_assert(hipGridDim_x == 1);
if (num_coeffs % 2 != 0) {
half last = input.m_impl.coeff(num_coeffs-1);
*scratch = __halves2half2(last, reducer.initialize());
} else {
*scratch = reducer.template initializePacket<half2>();
}
}
template <typename Self,
typename Reducer, typename Index>
__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
const Index num_threads = hipBlockDim_x * hipGridDim_x;
const Index num_packets = num_coeffs / 2;
for (Index i = thread_id; i < num_packets; i += num_threads) {
((half2*)output)[i] = reducer.template initializePacket<half2>();
}
if (thread_id == 0 && num_coeffs % 2 != 0) {
output[num_coeffs-1] = reducer.initialize();
}
}
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
half* output, half2* scratch) {
eigen_assert(NumPerThread % 2 == 0);
const Index first_index = hipBlockIdx_x * BlockSize * NumPerThread + 2*hipThreadIdx_x;
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
if (hipGridDim_x == 1 && first_index == 0) {
if (num_coeffs % 2 != 0) {
half last = input.m_impl.coeff(num_coeffs-1);
*scratch = __halves2half2(last, reducer.initialize());
} else {
*scratch = reducer.template initializePacket<half2>();
}
__syncthreads();
}
half2 accum = reducer.template initializePacket<half2>();
const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
for (Index i = 0; i < max_iter; i += BlockSize) {
const Index index = first_index + 2*i;
eigen_assert(index + 1 < num_coeffs);
half2 val = input.m_impl.template packet<Unaligned>(index);
reducer.reducePacket(val, &accum);
}
#pragma unroll
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in, wka_out;
wka_in.h = accum;
wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
reducer.reducePacket(wka_out.h, &accum);
}
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
atomicReduce(scratch, accum, reducer);
}
__syncthreads();
if (hipGridDim_x == 1 && first_index == 0) {
half tmp = __low2half(*scratch);
reducer.reduce(__high2half(*scratch), &tmp);
*output = tmp;
}
}
template <typename Op>
__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) {
eigen_assert(hipThreadIdx_x == 1);
half tmp = __low2half(*scratch);
reducer.reduce(__high2half(*scratch), &tmp);
*output = tmp;
}
#endif
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
assert(false && "Should only be called on doubles, floats and half floats");
}
};
namespace {
std::mutex __eigen_reduction_hip_mutex;
}
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
Self, Op, OutputType, PacketAccess,
typename internal::enable_if<
internal::is_same<float, OutputType>::value ||
internal::is_same<double, OutputType>::value,
void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
// guard FullReductionLauncher with a mutex so only 1 FullReductionKernel
// is dispatched at a time
std::lock_guard<std::mutex> lock(__eigen_reduction_hip_mutex);
typedef typename Self::Index Index;
typedef typename Self::CoeffReturnType Scalar;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
unsigned int* semaphore = NULL;
if (num_blocks > 1) {
semaphore = device.semaphore();
unsigned int semaphore_host = 0xFF;
hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
if (semaphore_host != 0) {
std::cerr << "[WARN][EIGEN][FullReductionLauncher] incorrect semaphore value: "
<< semaphore_host << "\n";
// wait for all commands on the device to complete so semaphore value
// is reset to 0
hipDeviceSynchronize();
// read again
hipMemcpy(&semaphore_host, semaphore, sizeof(unsigned int), hipMemcpyDeviceToHost);
if (semaphore_host != 0) {
std::cerr << "[ERROR][EIGEN][FullReductionLauncher] CRITICAL incorrect semaphore value: "
<< semaphore_host << ", apply manual override to 0\n";
// force set semaphore value to be 0
semaphore_host = 0;
hipMemcpy(semaphore, &semaphore_host, sizeof(unsigned int), hipMemcpyHostToDevice);
}
}
}
hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
dim3(num_blocks), dim3(block_size), 0, device.stream(), self, num_coeffs, output, semaphore, reducer);
}
};
#if defined(EIGEN_HAS_HIP_FP16)
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
assert(false && "Should not be called since there is no packet accessor");
}
};
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
half2* scratch = static_cast<half2*>(device.scratchpad());
if (num_blocks > 1) {
// We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
// won't be a race conditions between multiple thread blocks.
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
dim3(1), dim3(1), 0, device.stream(), reducer, self, num_coeffs, scratch);
}
hipLaunchKernelGGL(HIP_KERNEL_NAME(FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs, output, scratch);
if (num_blocks > 1) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionCleanupKernelHalfFloat<Op>),
dim3(1), dim3(1), 0, device.stream(), reducer, output, scratch);
}
}
};
#endif
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple cases
// of doubles, floats and half floats
#if defined(EIGEN_HAS_HIP_FP16)
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif
template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
return;
}
FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if defined(EIGEN_HIP_DEVICE_COMPILE) && (__HIP_DEVICE_COMPILE__ == 1) &&\
defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)
typedef typename Self::CoeffReturnType Type;
eigen_assert(hipBlockDim_y == 1);
eigen_assert(hipBlockDim_z == 1);
eigen_assert(hipGridDim_y == 1);
eigen_assert(hipGridDim_z == 1);
const int unroll_times = 16;
eigen_assert(NumPerThread % unroll_times == 0);
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread);
const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
const Index num_threads = hipBlockDim_x * hipGridDim_x;
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (hipGridDim_x == 1) {
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
__syncthreads();
}
for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
const Index row = i / input_col_blocks;
if (row < num_preserved_coeffs) {
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x;
Type reduced_val = reducer.initialize();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
for (Index col = col_begin + hipBlockDim_x * j; col < num_coeffs_to_reduce; col += hipBlockDim_x) {
const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
}
break;
} else {
// Faster version of the loop with no branches after unrolling.
#pragma unroll
for (int k = 0; k < unroll_times; ++k) {
const Index col = col_begin + hipBlockDim_x * (j + k);
reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
}
}
}
#pragma unroll
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
// XXX use std::is_floating_point to determine the type of reduced_val
if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
}
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
atomicReduce(&(output[row]), reduced_val, reducer);
}
}
}
#else
assert(0 && "Shouldn't be called on unsupported device");
#endif
}
#if defined(EIGEN_HAS_HIP_FP16)
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
half* output) {
eigen_assert(hipBlockDim_y == 1);
eigen_assert(hipBlockDim_z == 1);
eigen_assert(hipGridDim_y == 1);
eigen_assert(hipGridDim_z == 1);
const int unroll_times = 16;
eigen_assert(NumPerThread % unroll_times == 0);
eigen_assert(unroll_times % 2 == 0);
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, hipBlockDim_x * NumPerThread * 2);
const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
const Index num_threads = hipBlockDim_x * hipGridDim_x;
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (hipGridDim_x == 1) {
Index i = 2*thread_id;
for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
half* loc = output + i;
*((half2*)loc) = reducer.template initializePacket<half2>();
}
if (i < num_preserved_coeffs) {
output[i] = reducer.initialize();
}
__syncthreads();
}
for (Index i = hipBlockIdx_x; i < num_input_blocks; i += hipGridDim_x) {
const Index row = 2 * (i / input_col_blocks);
if (row + 1 < num_preserved_coeffs) {
const Index col_block = i % input_col_blocks;
const Index col_begin = 2 * (col_block * hipBlockDim_x * NumPerThread + hipThreadIdx_x);
half2 reduced_val1 = reducer.template initializePacket<half2>();
half2 reduced_val2 = reducer.template initializePacket<half2>();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + hipBlockDim_x * (j + unroll_times - 1) * 2;
if (last_col >= num_coeffs_to_reduce) {
Index col = col_begin + hipBlockDim_x * j;
for (; col + 1 < num_coeffs_to_reduce; col += hipBlockDim_x) {
const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
reducer.reducePacket(val1, &reduced_val1);
const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
reducer.reducePacket(val2, &reduced_val2);
}
if (col < num_coeffs_to_reduce) {
// Peel;
const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
const half2 val1 = __halves2half2(last1, reducer.initialize());
reducer.reducePacket(val1, &reduced_val1);
const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
const half2 val2 = __halves2half2(last2, reducer.initialize());
reducer.reducePacket(val2, &reduced_val2);
}
break;
} else {
// Faster version of the loop with no branches after unrolling.
#pragma unroll
for (int k = 0; k < unroll_times; ++k) {
const Index col = col_begin + hipBlockDim_x * (j + k) * 2;
reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
}
}
}
#pragma unroll
for (int offset = HIP_WARP_SIZE/2; offset > 0; offset /= 2) {
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in, wka_out;
wka_in.h = reduced_val1;
wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
reducer.reducePacket(wka_out.h, &reduced_val1);
wka_in.h = reduced_val2;
wka_out.i = __shfl_down(wka_in.i, offset, HIP_WARP_SIZE);
reducer.reducePacket(wka_out.h, &reduced_val2);
}
half val1 = __low2half(reduced_val1);
reducer.reduce(__high2half(reduced_val1), &val1);
half val2 = __low2half(reduced_val2);
reducer.reduce(__high2half(reduced_val2), &val2);
half2 val = __halves2half2(val1, val2);
if ((hipThreadIdx_x & (HIP_WARP_SIZE - 1)) == 0) {
half* loc = output + row;
atomicReduce((half2*)loc, val, reducer);
}
}
}
}
#endif
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
return true;
}
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
Self, Op, OutputType, PacketAccess,
typename internal::enable_if<
internal::is_same<float, OutputType>::value ||
internal::is_same<double, OutputType>::value,
void>::type> {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 128;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumHipMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be a race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumHipMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<OutputType, Index>),
dim3(num_blocks), dim3(1024), 0, device.stream(),
reducer.initialize(), num_preserved_vals, output);
}
hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernel<num_per_thread, Self, Op, Index>),
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self,
num_coeffs_to_reduce, num_preserved_vals, output);
return false;
}
};
#if defined(EIGEN_HAS_HIP_FP16)
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
assert(false && "Should not be called since there is no packet accessor");
return true;
}
};
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
if (num_preserved_vals % 2 != 0) {
// Not supported yet, revert to the slower code path
return true;
}
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = /*256*/128;
const int num_per_thread = /*128*/64;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumHipMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be a race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumHipMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernelHalfFloat<Self, Op, Index>),
dim3(1), dim3(1), 0, device.stream(), reducer, self, num_preserved_vals, output);
}
hipLaunchKernelGGL(HIP_KERNEL_NAME(InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false;
}
};
#endif
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple case
// of floats and half floats.
#if defined(EIGEN_HAS_HIP_FP16)
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif
template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
return true;
}
// It's faster to use the usual code.
if (num_coeffs_to_reduce <= 128) {
return true;
}
return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
}
};
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
const Index num_threads = hipBlockDim_x * hipGridDim_x;
const Index thread_id = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (hipGridDim_x == 1) {
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
__syncthreads();
}
// Do the reduction.
const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
for (Index i = thread_id; i < max_iter; i += num_threads) {
const Index input_col = i % num_preserved_coeffs;
const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
typename Self::CoeffReturnType reduced_val = reducer.initialize();
const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
for (Index j = input_row; j < max_row; j++) {
typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
reducer.reduce(val, &reduced_val);
}
atomicReduce(&(output[input_col]), reduced_val, reducer);
}
}
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support well exotic types such as complex,
// so reduce the scope of the optimized version of the code to the simple case
// of floats.
static const bool HasOptimizedImplementation = !Op::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType>
static bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true;
}
static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
// It's faster to use the usual code.
if (num_coeffs_to_reduce <= 32) {
return true;
}
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumHipMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) {
// We initialize the outputs in the reduction kernel itself when we don't have to worry
// about race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumHipMultiProcessors() *
device.maxHipThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReductionInitKernel<float, Index>),
dim3(num_blocks), dim3(1024), 0, device.stream(),
reducer.initialize(), num_preserved_vals, output);
}
hipLaunchKernelGGL(HIP_KERNEL_NAME(OuterReductionKernel<num_per_thread, Self, Op, Index>),
dim3(num_blocks), dim3(block_size), 0, device.stream(), reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false;
}
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_HIP_H

View File

@ -242,7 +242,7 @@ struct ScanLauncher {
}
};
#if defined(EIGEN_USE_GPU) && defined(EIGEN_CUDACC)
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC))
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
@ -278,10 +278,15 @@ struct ScanLauncher<Self, Reducer, GpuDevice> {
Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64;
#if defined(EIGEN_HIPCC)
hipLaunchKernelGGL(HIP_KERNEL_NAME(ScanKernel<Self, Reducer>), dim3(num_blocks),
dim3(block_size), 0, self.device().stream(), self, total_size, data);
#else
LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
#endif
}
};
#endif // EIGEN_USE_GPU && EIGEN_CUDACC
#endif // EIGEN_USE_GPU && (EIGEN_CUDACC || EIGEN_HIPCC)
} // end namespace Eigen

View File

@ -268,6 +268,9 @@ template<
typename Reducer
> struct reduce<Reducer>
{
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
constexpr static inline int run() { return Reducer::Identity; }
};
@ -276,6 +279,9 @@ template<
typename A
> struct reduce<Reducer, A>
{
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
constexpr static inline A run(A a) { return a; }
};
@ -285,6 +291,9 @@ template<
typename... Ts
> struct reduce<Reducer, A, Ts...>
{
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
}
@ -324,6 +333,9 @@ struct greater_equal_zero_op { template<typename A> constexpr static inline auto
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
// does...
template<typename... Ts>
#if defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
{
return reduce<product_op, Ts...>::run(ts...);

View File

@ -15,7 +15,7 @@
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_AVOID_STL_ARRAY)
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {

View File

@ -190,7 +190,7 @@ template <>
struct lgamma_impl<float> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE float run(float x) {
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
int dummy;
return ::lgammaf_r(x, &dummy);
#else
@ -203,7 +203,7 @@ template <>
struct lgamma_impl<double> {
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE double run(double x) {
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
#if !defined(EIGEN_CUDA_ARCH) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__) && !defined(EIGEN_HIP_DEVICE_COMPILE)
int dummy;
return ::lgamma_r(x, &dummy);
#else

View File

@ -297,3 +297,55 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
endif()
# Add HIP specific tests
if (EIGEN_TEST_HIP)
set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
if (EXISTS ${HIP_PATH})
list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
find_package(HIP REQUIRED)
if (HIP_FOUND)
execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
if (${HIP_PLATFORM} STREQUAL "hcc")
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories(${HIP_PATH}/include)
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
# ei_add_test(cxx11_tensor_complex_hip)
# ei_add_test(cxx11_tensor_complex_cwise_ops_hip)
ei_add_test(cxx11_tensor_reduction_hip)
ei_add_test(cxx11_tensor_argmax_hip)
ei_add_test(cxx11_tensor_cast_float16_hip)
ei_add_test(cxx11_tensor_scan_hip)
ei_add_test(cxx11_tensor_device_hip)
ei_add_test(cxx11_tensor_hip)
ei_add_test(cxx11_tensor_contract_hip)
ei_add_test(cxx11_tensor_of_float16_hip)
ei_add_test(cxx11_tensor_random_hip)
unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
elseif (${HIP_PLATFORM} STREQUAL "nvcc")
message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
else ()
message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
endif()
endif(HIP_FOUND)
else ()
message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
endif()
endif(EIGEN_TEST_HIP)

View File

@ -0,0 +1,251 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_FUNC cxx11_tensor_hip
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template <int Layout>
void test_hip_simple_argmax()
{
Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
in.setRandom();
in *= in.constant(100.0);
in(0, 0, 0) = -1000.0;
in(71, 52, 96) = 1000.0;
std::size_t in_bytes = in.size() * sizeof(double);
std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
double* d_in;
DenseIndex* d_out_max;
DenseIndex* d_out_min;
hipMalloc((void**)(&d_in), in_bytes);
hipMalloc((void**)(&d_out_max), out_bytes);
hipMalloc((void**)(&d_out_min), out_bytes);
hipMemcpy(d_in, in.data(), in_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
gpu_out_max.device(gpu_device) = gpu_in.argmax();
gpu_out_min.device(gpu_device) = gpu_in.argmin();
assert(hipMemcpyAsync(out_max.data(), d_out_max, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipMemcpyAsync(out_min.data(), d_out_min, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
hipFree(d_in);
hipFree(d_out_max);
hipFree(d_out_min);
}
template <int DataLayout>
void test_hip_argmax_dim()
{
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims;
dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
for (int dim = 0; dim < 4; ++dim) {
tensor.setRandom();
tensor = (tensor + tensor.constant(0.5)).log();
array<DenseIndex, 3> out_shape;
for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
array<DenseIndex, 4> ix;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
if (ix[dim] != 0) continue;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
tensor(ix) = 10.0;
}
}
}
}
std::size_t in_bytes = tensor.size() * sizeof(float);
std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
float* d_in;
DenseIndex* d_out;
hipMalloc((void**)(&d_in), in_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(),
size_t(2*3*5*7 / tensor.dimension(dim)));
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the first index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
}
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
if (ix[dim] != tensor.dimension(dim) - 1) continue;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
tensor(ix) = 20.0;
}
}
}
}
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmax(dim);
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
hipFree(d_in);
hipFree(d_out);
}
}
template <int DataLayout>
void test_hip_argmin_dim()
{
Tensor<float, 4, DataLayout> tensor(2,3,5,7);
std::vector<int> dims;
dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
for (int dim = 0; dim < 4; ++dim) {
tensor.setRandom();
tensor = (tensor + tensor.constant(0.5)).log();
array<DenseIndex, 3> out_shape;
for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];
Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
array<DenseIndex, 4> ix;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
if (ix[dim] != 0) continue;
// suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
tensor(ix) = -10.0;
}
}
}
}
std::size_t in_bytes = tensor.size() * sizeof(float);
std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
float* d_in;
DenseIndex* d_out;
hipMalloc((void**)(&d_in), in_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
VERIFY_IS_EQUAL(tensor_arg.size(),
2*3*5*7 / tensor.dimension(dim));
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect min to be in the first index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
}
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 3; ++j) {
for (int k = 0; k < 5; ++k) {
for (int l = 0; l < 7; ++l) {
ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
if (ix[dim] != tensor.dimension(dim) - 1) continue;
// suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
tensor(ix) = -20.0;
}
}
}
}
hipMemcpy(d_in, tensor.data(), in_bytes, hipMemcpyHostToDevice);
gpu_out.device(gpu_device) = gpu_in.argmin(dim);
assert(hipMemcpyAsync(tensor_arg.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
// Expect max to be in the last index of the reduced dimension
VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
}
hipFree(d_in);
hipFree(d_out);
}
}
void test_cxx11_tensor_hip()
{
CALL_SUBTEST(test_hip_simple_argmax<RowMajor>());
CALL_SUBTEST(test_hip_simple_argmax<ColMajor>());
CALL_SUBTEST(test_hip_argmax_dim<RowMajor>());
CALL_SUBTEST(test_hip_argmax_dim<ColMajor>());
CALL_SUBTEST(test_hip_argmin_dim<RowMajor>());
CALL_SUBTEST(test_hip_argmin_dim<ColMajor>());
}

View File

@ -0,0 +1,79 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
void test_hip_conversion() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
Tensor<float, 1> floats(num_elem);
floats.setRandom();
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
d_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
d_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
d_conv, num_elem);
gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
gpu_conv.device(gpu_device) = gpu_half.cast<float>();
Tensor<float, 1> initial(num_elem);
Tensor<float, 1> final(num_elem);
gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
VERIFY_IS_APPROX(initial(i), final(i));
}
gpu_device.deallocate(d_float);
gpu_device.deallocate(d_half);
gpu_device.deallocate(d_conv);
}
void test_fallback_conversion() {
int num_elem = 101;
Tensor<float, 1> floats(num_elem);
floats.setRandom();
Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
Eigen::Tensor<float, 1> conv = halfs.cast<float>();
for (int i = 0; i < num_elem; ++i) {
VERIFY_IS_APPROX(floats(i), conv(i));
}
}
void test_cxx11_tensor_cast_float16_hip()
{
CALL_SUBTEST(test_hip_conversion());
CALL_SUBTEST(test_fallback_conversion());
}

View File

@ -0,0 +1,215 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
void test_hip_contraction(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
Tensor<float, 2, DataLayout> t_result(m_size, n_size);
Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size() * sizeof(float);
float* d_t_left;
float* d_t_right;
float* d_t_result;
hipMalloc((void**)(&d_t_left), t_left_bytes);
hipMalloc((void**)(&d_t_right), t_right_bytes);
hipMalloc((void**)(&d_t_result), t_result_bytes);
hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue;
}
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
continue;
}
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
<< " vs " << t_result_gpu(i) << std::endl;
assert(false);
}
hipFree((void*)d_t_left);
hipFree((void*)d_t_right);
hipFree((void*)d_t_result);
}
template<int DataLayout>
void test_scalar(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
// more than 30 * 1024, which is the number of threads in blocks on
// a 15 SM GK110 GPU
Tensor<float, 2, DataLayout> t_left(m_size, k_size);
Tensor<float, 2, DataLayout> t_right(k_size, n_size);
Tensor<float, 0, DataLayout> t_result;
Tensor<float, 0, DataLayout> t_result_gpu;
Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
t_left.setRandom();
t_right.setRandom();
std::size_t t_left_bytes = t_left.size() * sizeof(float);
std::size_t t_right_bytes = t_right.size() * sizeof(float);
std::size_t t_result_bytes = sizeof(float);
float* d_t_left;
float* d_t_right;
float* d_t_result;
hipMalloc((void**)(&d_t_left), t_left_bytes);
hipMalloc((void**)(&d_t_right), t_right_bytes);
hipMalloc((void**)(&d_t_result), t_result_bytes);
hipMemcpy(d_t_left, t_left.data(), t_left_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_t_right, t_right.data(), t_right_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_left(d_t_left, m_size, k_size);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
gpu_t_right(d_t_right, k_size, n_size);
Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
gpu_t_result(d_t_result);
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
std::cout << "mismatch detected: " << t_result()
<< " vs " << t_result_gpu() << std::endl;
assert(false);
}
hipFree((void*)d_t_left);
hipFree((void*)d_t_right);
hipFree((void*)d_t_result);
}
template<int DataLayout>
void test_hip_contraction_m() {
for (int k = 32; k < 256; k++) {
test_hip_contraction<ColMajor>(k, 128, 128);
test_hip_contraction<RowMajor>(k, 128, 128);
}
}
template<int DataLayout>
void test_hip_contraction_k() {
for (int k = 32; k < 256; k++) {
test_hip_contraction<ColMajor>(128, k, 128);
test_hip_contraction<RowMajor>(128, k, 128);
}
}
template<int DataLayout>
void test_hip_contraction_n() {
for (int k = 32; k < 256; k++) {
test_hip_contraction<ColMajor>(128, 128, k);
test_hip_contraction<RowMajor>(128, 128, k);
}
}
template<int DataLayout>
void test_hip_contraction_sizes() {
int m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025};
int n_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257, 511,
512, 513, 1023, 1024, 1025};
int k_sizes[] = { 31, 39, 63, 64, 65,
95, 96, 127, 129, 255,
257, 511, 512, 513, 1023,
1024, 1025};
for (int i = 0; i < 15; i++) {
for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
test_hip_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
}
}
}
}
void test_cxx11_tensor_hip()
{
CALL_SUBTEST(test_hip_contraction<ColMajor>(128, 128, 128));
CALL_SUBTEST(test_hip_contraction<RowMajor>(128, 128, 128));
CALL_SUBTEST(test_scalar<ColMajor>(128, 128, 128));
CALL_SUBTEST(test_scalar<RowMajor>(128, 128, 128));
CALL_SUBTEST(test_hip_contraction_m<ColMajor>());
CALL_SUBTEST(test_hip_contraction_m<RowMajor>());
CALL_SUBTEST(test_hip_contraction_k<ColMajor>());
CALL_SUBTEST(test_hip_contraction_k<RowMajor>());
CALL_SUBTEST(test_hip_contraction_n<ColMajor>());
CALL_SUBTEST(test_hip_contraction_n<RowMajor>());
// Commenting out these tests due to long runtimes
// CALL_SUBTEST(test_hip_contraction_sizes<ColMajor>());
// CALL_SUBTEST(test_hip_contraction_sizes<RowMajor>());
}

View File

@ -0,0 +1,389 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_device
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
using Eigen::RowMajor;
// Context for evaluation on cpu
struct CPUContext {
CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
kernel_1d_(0) = 3.14f;
kernel_1d_(1) = 2.7f;
kernel_2d_(0,0) = 3.14f;
kernel_2d_(1,0) = 2.7f;
kernel_2d_(0,1) = 0.2f;
kernel_2d_(1,1) = 7.0f;
kernel_3d_(0,0,0) = 3.14f;
kernel_3d_(0,1,0) = 2.7f;
kernel_3d_(0,0,1) = 0.2f;
kernel_3d_(0,1,1) = 7.0f;
kernel_3d_(1,0,0) = -1.0f;
kernel_3d_(1,1,0) = -0.3f;
kernel_3d_(1,0,1) = -0.7f;
kernel_3d_(1,1,1) = -0.5f;
}
const Eigen::DefaultDevice& device() const { return cpu_device_; }
const Eigen::Tensor<float, 3>& in1() const { return in1_; }
const Eigen::Tensor<float, 3>& in2() const { return in2_; }
Eigen::Tensor<float, 3>& out() { return out_; }
const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
private:
const Eigen::Tensor<float, 3>& in1_;
const Eigen::Tensor<float, 3>& in2_;
Eigen::Tensor<float, 3>& out_;
Eigen::Tensor<float, 1> kernel_1d_;
Eigen::Tensor<float, 2> kernel_2d_;
Eigen::Tensor<float, 3> kernel_3d_;
Eigen::DefaultDevice cpu_device_;
};
// Context for evaluation on GPU
struct GPUContext {
GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
assert(hipMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == hipSuccess);
float kernel_1d_val[] = {3.14f, 2.7f};
assert(hipMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
assert(hipMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == hipSuccess);
float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
assert(hipMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
assert(hipMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == hipSuccess);
float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};
assert(hipMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), hipMemcpyHostToDevice) == hipSuccess);
}
~GPUContext() {
assert(hipFree(kernel_1d_) == hipSuccess);
assert(hipFree(kernel_2d_) == hipSuccess);
assert(hipFree(kernel_3d_) == hipSuccess);
}
const Eigen::GpuDevice& device() const { return gpu_device_; }
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
private:
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;
const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
float* kernel_1d_;
float* kernel_2d_;
float* kernel_3d_;
Eigen::HipStreamDevice stream_;
Eigen::GpuDevice gpu_device_;
};
// The actual expression to evaluate
template <typename Context>
void test_contextual_eval(Context* context)
{
context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
void test_forced_contextual_eval(Context* context)
{
context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
}
template <typename Context>
void test_compound_assignment(Context* context)
{
context->out().device(context->device()) = context->in1().constant(2.718f);
context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
}
template <typename Context>
void test_contraction(Context* context)
{
Eigen::array<std::pair<int, int>, 2> dims;
dims[0] = std::make_pair(1, 1);
dims[1] = std::make_pair(2, 2);
Eigen::array<int, 2> shape(40, 50*70);
Eigen::DSizes<int, 2> indices(0,0);
Eigen::DSizes<int, 2> sizes(40,40);
context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
}
template <typename Context>
void test_1d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,70);
Eigen::array<int, 1> dims(1);
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
}
template <typename Context>
void test_2d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(40,49,69);
Eigen::array<int, 2> dims(1,2);
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
}
template <typename Context>
void test_3d_convolution(Context* context)
{
Eigen::DSizes<int, 3> indices(0,0,0);
Eigen::DSizes<int, 3> sizes(39,49,69);
Eigen::array<int, 3> dims(0,1,2);
context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
}
void test_cpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
in1 = in1.random() + in1.constant(10.0f);
in2 = in2.random() + in2.constant(10.0f);
CPUContext context(in1, in2, out);
test_contextual_eval(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_forced_contextual_eval(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
}
}
}
test_compound_assignment(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_contraction(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 40; ++j) {
const float result = out(i,j,0);
float expected = 0;
for (int k = 0; k < 50; ++k) {
for (int l = 0; l < 70; ++l) {
expected += in1(i, k, l) * in2(j, k, l);
}
}
VERIFY_IS_APPROX(expected, result);
}
}
test_1d_convolution(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
}
}
}
test_2d_convolution(&context);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
(in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
continue;
}
VERIFY_IS_APPROX(expected, result);
}
}
}
test_3d_convolution(&context);
for (int i = 0; i < 39; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
(in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {
continue;
}
VERIFY_IS_APPROX(expected, result);
}
}
}
}
void test_gpu() {
Eigen::Tensor<float, 3> in1(40,50,70);
Eigen::Tensor<float, 3> in2(40,50,70);
Eigen::Tensor<float, 3> out(40,50,70);
in1 = in1.random() + in1.constant(10.0f);
in2 = in2.random() + in2.constant(10.0f);
std::size_t in1_bytes = in1.size() * sizeof(float);
std::size_t in2_bytes = in2.size() * sizeof(float);
std::size_t out_bytes = out.size() * sizeof(float);
float* d_in1;
float* d_in2;
float* d_out;
hipMalloc((void**)(&d_in1), in1_bytes);
hipMalloc((void**)(&d_in2), in2_bytes);
hipMalloc((void**)(&d_out), out_bytes);
hipMemcpy(d_in1, in1.data(), in1_bytes, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2.data(), in2_bytes, hipMemcpyHostToDevice);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
GPUContext context(gpu_in1, gpu_in2, gpu_out);
test_contextual_eval(&context);
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_forced_contextual_eval(&context);
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
}
}
}
test_compound_assignment(&context);
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 50; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
}
}
}
test_contraction(&context);
assert(hipMemcpy(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost) == hipSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 40; ++j) {
const float result = out(i,j,0);
float expected = 0;
for (int k = 0; k < 50; ++k) {
for (int l = 0; l < 70; ++l) {
expected += in1(i, k, l) * in2(j, k, l);
}
}
VERIFY_IS_APPROX(expected, result);
}
}
test_1d_convolution(&context);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 70; ++k) {
VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
}
}
}
test_2d_convolution(&context);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
for (int i = 0; i < 40; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
VERIFY_IS_APPROX(expected, result);
}
}
}
/*
test_3d_convolution(&context);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, context.device().stream()) == hipSuccess);
assert(hipStreamSynchronize(context.device().stream()) == hipSuccess);
for (int i = 0; i < 39; ++i) {
for (int j = 0; j < 49; ++j) {
for (int k = 0; k < 69; ++k) {
const float result = out(i,j,k);
const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
VERIFY_IS_APPROX(expected, result);
}
}
}
*/
}
void test_cxx11_tensor_device()
{
CALL_SUBTEST(test_cpu());
CALL_SUBTEST(test_gpu());
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,498 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
template<typename>
void test_hip_numext() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
d_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
d_res_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, num_elem);
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
Tensor<bool, 1> half_prec(num_elem);
Tensor<bool, 1> full_prec(num_elem);
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking numext " << i << std::endl;
VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
}
gpu_device.deallocate(d_float);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
#ifdef EIGEN_HAS_HIP_FP16
template<typename>
void test_hip_conversion() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
d_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
d_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
d_conv, num_elem);
gpu_float.device(gpu_device) = gpu_float.random();
gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
gpu_conv.device(gpu_device) = gpu_half.cast<float>();
Tensor<float, 1> initial(num_elem);
Tensor<float, 1> final(num_elem);
gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
for (int i = 0; i < num_elem; ++i) {
VERIFY_IS_APPROX(initial(i), final(i));
}
gpu_device.deallocate(d_float);
gpu_device.deallocate(d_half);
gpu_device.deallocate(d_conv);
}
template<typename>
void test_hip_unary() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
d_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
d_res_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, num_elem);
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
gpu_res_float.device(gpu_device) = gpu_float.abs();
gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
Tensor<float, 1> half_prec(num_elem);
Tensor<float, 1> full_prec(num_elem);
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking unary " << i << std::endl;
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
}
gpu_device.deallocate(d_float);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
template<typename>
void test_hip_elementwise() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
d_float1, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
d_float2, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
d_res_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, num_elem);
gpu_float1.device(gpu_device) = gpu_float1.random();
gpu_float2.device(gpu_device) = gpu_float2.random();
gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
Tensor<float, 1> half_prec(num_elem);
Tensor<float, 1> full_prec(num_elem);
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));
}
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
template<typename>
void test_hip_trancendental() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem);
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
gpu_float3.device(gpu_device) = gpu_float3.random();
gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>();
gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1();
Tensor<float, 1> input1(num_elem);
Tensor<Eigen::half, 1> half_prec1(num_elem);
Tensor<Eigen::half, 1> full_prec1(num_elem);
Tensor<float, 1> input2(num_elem);
Tensor<Eigen::half, 1> half_prec2(num_elem);
Tensor<Eigen::half, 1> full_prec2(num_elem);
Tensor<float, 1> input3(num_elem);
Tensor<Eigen::half, 1> half_prec3(num_elem);
Tensor<Eigen::half, 1> full_prec3(num_elem);
gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
}
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1
VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
else
VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
}
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
}
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_float3);
gpu_device.deallocate(d_res1_half);
gpu_device.deallocate(d_res1_float);
gpu_device.deallocate(d_res2_half);
gpu_device.deallocate(d_res2_float);
gpu_device.deallocate(d_res3_float);
gpu_device.deallocate(d_res3_half);
}
template<typename>
void test_hip_contractions() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int rows = 23;
int cols = 23;
int num_elem = rows*cols;
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
d_float1, rows, cols);
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
d_float2, rows, cols);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
d_res_half, rows, cols);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
d_res_float, rows, cols);
gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
typedef Tensor<float, 2>::DimensionPair DimPair;
Eigen::array<DimPair, 1> dims(DimPair(1, 0));
gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
Tensor<Eigen::half, 2> half_prec(rows, cols);
Tensor<Eigen::half, 2> full_prec(rows, cols);
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
gpu_device.synchronize();
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {
VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));
}
}
}
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
template<typename>
void test_hip_reductions(int size1, int size2, int redux) {
std::cout << "Reducing " << size1 << " by " << size2
<< " tensor along dim " << redux << std::endl;
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = size1*size2;
int result_size = (redux == 1 ? size1 : size2);
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
d_float1, size1, size2);
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
d_float2, size1, size2);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
d_res_half, result_size);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, result_size);
gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
Eigen::array<int, 1> redux_dim(redux);
gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
Tensor<Eigen::half, 1> half_prec(result_size);
Tensor<Eigen::half, 1> full_prec(result_size);
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
gpu_device.synchronize();
for (int i = 0; i < result_size; ++i) {
std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
}
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
template<typename>
void test_hip_reductions() {
test_hip_reductions<void>(13, 13, 0);
test_hip_reductions<void>(13, 13, 1);
test_hip_reductions<void>(35, 36, 0);
test_hip_reductions<void>(35, 36, 1);
test_hip_reductions<void>(36, 35, 0);
test_hip_reductions<void>(36, 35, 1);
}
template<typename>
void test_hip_full_reductions() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int size = 13;
int num_elem = size*size;
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
d_float1, size, size);
Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
d_float2, size, size);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
d_res_half);
Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
d_res_float);
gpu_float1.device(gpu_device) = gpu_float1.random();
gpu_float2.device(gpu_device) = gpu_float2.random();
gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
Tensor<Eigen::half, 0> half_prec;
Tensor<Eigen::half, 0> full_prec;
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
gpu_device.synchronize();
VERIFY_IS_APPROX(full_prec(), half_prec());
gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
gpu_device.synchronize();
VERIFY_IS_APPROX(full_prec(), half_prec());
gpu_device.deallocate(d_float1);
gpu_device.deallocate(d_float2);
gpu_device.deallocate(d_res_half);
gpu_device.deallocate(d_res_float);
}
template<typename>
void test_hip_forced_evals() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
int num_elem = 101;
float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
d_float, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
d_res_half1, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
d_res_half2, num_elem);
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
d_res_float, num_elem);
Eigen::array<int, 1> no_bcast;
no_bcast[0] = 1;
gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
gpu_res_float.device(gpu_device) = gpu_float.abs();
gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
Tensor<float, 1> half_prec1(num_elem);
Tensor<float, 1> half_prec2(num_elem);
Tensor<float, 1> full_prec(num_elem);
gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float));
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
gpu_device.synchronize();
for (int i = 0; i < num_elem; ++i) {
std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
}
gpu_device.deallocate(d_float);
gpu_device.deallocate(d_res_half1);
gpu_device.deallocate(d_res_half2);
gpu_device.deallocate(d_res_float);
}
#endif
void test_cxx11_tensor_of_float16_hip()
{
CALL_SUBTEST(test_hip_numext<void>());
#ifdef EIGEN_HAS_HIP_FP16
CALL_SUBTEST(test_hip_conversion<void>());
CALL_SUBTEST(test_hip_unary<void>());
CALL_SUBTEST(test_hip_elementwise<void>());
CALL_SUBTEST(test_hip_trancendental<void>());
CALL_SUBTEST(test_hip_contractions<void>());
CALL_SUBTEST(test_hip_reductions<void>());
CALL_SUBTEST(test_hip_full_reductions<void>());
CALL_SUBTEST(test_hip_forced_evals<void>());
#else
std::cout << "Half floats are not supported by this version of hip: skipping the test" << std::endl;
#endif
}

View File

@ -0,0 +1,85 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_random_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <Eigen/CXX11/Tensor>
void test_hip_random_uniform()
{
Tensor<float, 2> out(72,97);
out.setZero();
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
hipMalloc((void**)(&d_out), out_bytes);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
gpu_out.device(gpu_device) = gpu_out.random();
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
// For now we just check thes code doesn't crash.
// TODO: come up with a valid test of randomness
}
void test_hip_random_normal()
{
Tensor<float, 2> out(72,97);
out.setZero();
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
hipMalloc((void**)(&d_out), out_bytes);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
Eigen::internal::NormalRandomGenerator<float> gen(true);
gpu_out.device(gpu_device) = gpu_out.random(gen);
assert(hipMemcpyAsync(out.data(), d_out, out_bytes, hipMemcpyDeviceToHost, gpu_device.stream()) == hipSuccess);
assert(hipStreamSynchronize(gpu_device.stream()) == hipSuccess);
}
static void test_complex()
{
Tensor<std::complex<float>, 1> vec(6);
vec.setRandom();
// Fixme: we should check that the generated numbers follow a uniform
// distribution instead.
for (int i = 1; i < 6; ++i) {
VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
}
}
void test_cxx11_tensor_random_hip()
{
CALL_SUBTEST(test_hip_random_uniform());
CALL_SUBTEST(test_hip_random_normal());
CALL_SUBTEST(test_complex());
}

View File

@ -0,0 +1,154 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_reduction_hip
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
template<typename Type, int DataLayout>
static void test_full_reductions() {
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
const int num_rows = internal::random<int>(1024, 5*1024);
const int num_cols = internal::random<int>(1024, 5*1024);
Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
in.setRandom();
Tensor<Type, 0, DataLayout> full_redux;
full_redux = in.sum();
std::size_t in_bytes = in.size() * sizeof(Type);
std::size_t out_bytes = full_redux.size() * sizeof(Type);
Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
out_gpu.device(gpu_device) = in_gpu.sum();
Tensor<Type, 0, DataLayout> full_redux_gpu;
gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
gpu_device.synchronize();
// Check that the CPU and GPU reductions return the same result.
VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
gpu_device.deallocate(gpu_in_ptr);
gpu_device.deallocate(gpu_out_ptr);
}
template<typename Type, int DataLayout>
static void test_first_dim_reductions() {
int dim_x = 33;
int dim_y = 1;
int dim_z = 128;
Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
in.setRandom();
Eigen::array<int, 1> red_axis;
red_axis[0] = 0;
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device
Eigen::HipStreamDevice stream;
Eigen::GpuDevice dev(&stream);
// Create data(T)
Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
// Perform operation
dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
gpu_out.device(dev) = gpu_in.sum(red_axis);
gpu_out.device(dev) += gpu_in.sum(red_axis);
Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
dev.synchronize();
// Check that the CPU and GPU reductions return the same result.
for (int i = 0; i < gpu_out.size(); ++i) {
VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
}
dev.deallocate(in_data);
dev.deallocate(out_data);
}
template<typename Type, int DataLayout>
static void test_last_dim_reductions() {
int dim_x = 128;
int dim_y = 1;
int dim_z = 33;
Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
in.setRandom();
Eigen::array<int, 1> red_axis;
red_axis[0] = 2;
Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
// Create device
Eigen::HipStreamDevice stream;
Eigen::GpuDevice dev(&stream);
// Create data
Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
// Perform operation
dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
gpu_out.device(dev) = gpu_in.sum(red_axis);
gpu_out.device(dev) += gpu_in.sum(red_axis);
Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
dev.synchronize();
// Check that the CPU and GPU reductions return the same result.
for (int i = 0; i < gpu_out.size(); ++i) {
VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
}
dev.deallocate(in_data);
dev.deallocate(out_data);
}
void test_cxx11_tensor_reduction_hip() {
CALL_SUBTEST((test_full_reductions<float, ColMajor>()));
CALL_SUBTEST((test_full_reductions<double, ColMajor>()));
CALL_SUBTEST((test_full_reductions<float, RowMajor>()));
CALL_SUBTEST((test_full_reductions<double, RowMajor>()));
CALL_SUBTEST((test_first_dim_reductions<float, ColMajor>()));
CALL_SUBTEST((test_first_dim_reductions<double, ColMajor>()));
CALL_SUBTEST((test_first_dim_reductions<float, RowMajor>()));
// Outer reductions of doubles aren't supported just yet.
// CALL_SUBTEST((test_first_dim_reductions<double, RowMajor>()))
CALL_SUBTEST((test_last_dim_reductions<float, ColMajor>()));
// Outer reductions of doubles aren't supported just yet.
// CALL_SUBTEST((test_last_dim_reductions<double, ColMajor>()));
CALL_SUBTEST((test_last_dim_reductions<float, RowMajor>()));
CALL_SUBTEST((test_last_dim_reductions<double, RowMajor>()));
}

View File

@ -0,0 +1,76 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_TEST_FUNC cxx11_tensor_scan_hip
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
void test_hip_cumsum(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);
Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);
t_input.setRandom();
std::size_t t_input_bytes = t_input.size() * sizeof(float);
std::size_t t_result_bytes = t_result.size() * sizeof(float);
float* d_t_input;
float* d_t_result;
hipMalloc((void**)(&d_t_input), t_input_bytes);
hipMalloc((void**)(&d_t_result), t_result_bytes);
hipMemcpy(d_t_input, t_input.data(), t_input_bytes, hipMemcpyHostToDevice);
Eigen::HipStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
t_result = t_input.cumsum(1);
hipMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, hipMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue;
}
if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
continue;
}
std::cout << "mismatch detected at index " << i << ": " << t_result(i)
<< " vs " << t_result_gpu(i) << std::endl;
assert(false);
}
hipFree((void*)d_t_input);
hipFree((void*)d_t_result);
}
void test_cxx11_tensor_scan_hip()
{
CALL_SUBTEST(test_hip_cumsum<ColMajor>(128, 128, 128));
CALL_SUBTEST(test_hip_cumsum<RowMajor>(128, 128, 128));
}