mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Added support for simple coefficient wise tensor expression using half floats on CUDA devices
This commit is contained in:
parent
0606a0a39b
commit
ac5d706a94
@ -333,6 +333,7 @@ using std::ptrdiff_t;
|
||||
|
||||
#if defined EIGEN_VECTORIZE_CUDA
|
||||
#include "src/Core/arch/CUDA/PacketMath.h"
|
||||
#include "src/Core/arch/CUDA/PacketMathHalf.h"
|
||||
#include "src/Core/arch/CUDA/MathFunctions.h"
|
||||
#include "src/Core/arch/CUDA/TypeCasting.h"
|
||||
#endif
|
||||
|
220
Eigen/src/Core/arch/CUDA/PacketMathHalf.h
Normal file
220
Eigen/src/Core/arch/CUDA/PacketMathHalf.h
Normal file
@ -0,0 +1,220 @@
|
||||
// This file is part of Eigen, a lightweight C++ template library
|
||||
// for linear algebra.
|
||||
//
|
||||
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
||||
//
|
||||
// This Source Code Form is subject to the terms of the Mozilla
|
||||
// Public License v. 2.0. If a copy of the MPL was not distributed
|
||||
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
#define EIGEN_PACKET_MATH_HALF_CUDA_H
|
||||
|
||||
namespace Eigen {
|
||||
|
||||
namespace internal {
|
||||
|
||||
#if defined(EIGEN_HAS_CUDA_FP16)
|
||||
|
||||
// Make sure this is only available when targeting a GPU: we don't want to
|
||||
// introduce conflicts between these packet_traits definitions and the ones
|
||||
// we'll use on the host side (SSE, AVX, ...)
|
||||
#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
|
||||
|
||||
__device__ half operator + (const half& a, const half& b) {
|
||||
return __hadd(a, b);
|
||||
}
|
||||
__device__ half operator * (const half& a, const half& b) {
|
||||
return __hmul(a, b);
|
||||
}
|
||||
__device__ half operator - (const half& a, const half& b) {
|
||||
return __hsub(a, b);
|
||||
}
|
||||
__device__ half operator / (const half& a, const half& b) {
|
||||
assert(false && "tbd");
|
||||
return half();
|
||||
}
|
||||
__device__ half operator - (const half& a) {
|
||||
return __hneg(a);
|
||||
}
|
||||
|
||||
|
||||
template<> struct is_arithmetic<half2> { enum { value = true }; };
|
||||
|
||||
template<> struct packet_traits<half> : default_packet_traits
|
||||
{
|
||||
typedef half2 type;
|
||||
typedef half2 half;
|
||||
enum {
|
||||
Vectorizable = 1,
|
||||
AlignedOnScalar = 1,
|
||||
size=2,
|
||||
HasHalfPacket = 0,
|
||||
|
||||
HasDiv = 1,
|
||||
HasLog = 1,
|
||||
HasExp = 1,
|
||||
HasSqrt = 1,
|
||||
HasRsqrt = 1,
|
||||
HasLGamma = 1,
|
||||
HasDiGamma = 1,
|
||||
HasErf = 1,
|
||||
HasErfc = 1,
|
||||
|
||||
HasBlend = 0,
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
template<> struct unpacket_traits<half2> { typedef half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const half& from) {
|
||||
return __half2half2(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset<half2>(const half& a) {
|
||||
return __halves2half2(a, __hadd(a, __float2half(1)));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
|
||||
return __hadd2(a, b);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
|
||||
return __hsub2(a, b);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
|
||||
return __hneg2(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
|
||||
return __hmul2(a, b);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
|
||||
return __hfma2(a, b, c);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
float r1 = a1 / b1;
|
||||
float r2 = a2 / b2;
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
|
||||
half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
|
||||
float a1 = __low2float(a);
|
||||
float a2 = __high2float(a);
|
||||
float b1 = __low2float(b);
|
||||
float b2 = __high2float(b);
|
||||
half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
|
||||
half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
|
||||
return __halves2half2(r1, r2);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload<half2>(const half* from) {
|
||||
return *reinterpret_cast<const half2*>(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu<half2>(const half* from) {
|
||||
return __halves2half2(from[0], from[1]);
|
||||
}
|
||||
|
||||
template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const half* from) {
|
||||
return __halves2half2(from[0], from[0]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<half>(half* to, const half2& from) {
|
||||
*reinterpret_cast<half2*>(to) = from;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<half>(half* to, const half2& from) {
|
||||
to[0] = __low2half(from);
|
||||
to[1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const half* from) {
|
||||
return __ldg((const half2*)from);
|
||||
}
|
||||
|
||||
template<>
|
||||
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const half* from) {
|
||||
return __halves2half2(__ldg(from+0), __ldg(from+1));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half2 pgather<half, half2>(const half* from, Index stride) {
|
||||
return __halves2half2(from[0*stride], from[1*stride]);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline void pscatter<half, half2>(half* to, const half2& from, Index stride) {
|
||||
to[stride*0] = __low2half(from);
|
||||
to[stride*1] = __high2half(from);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half pfirst<half2>(const half2& a) {
|
||||
return __low2half(a);
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux<half2>(const half2& a) {
|
||||
return __hadd(__low2half(a), __high2half(a));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux_max<half2>(const half2& a) {
|
||||
half first = __low2half(a);
|
||||
half second = __high2half(a);
|
||||
return __hgt(first, second) ? first : second;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux_min<half2>(const half2& a) {
|
||||
half first = __low2half(a);
|
||||
half second = __high2half(a);
|
||||
return __hlt(first, second) ? first : second;
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half predux_mul<half2>(const half2& a) {
|
||||
return __hmul(__low2half(a), __high2half(a));
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC inline half2 pabs<half2>(const half2& a) {
|
||||
assert(false && "tbd");
|
||||
return half2();
|
||||
}
|
||||
|
||||
|
||||
EIGEN_DEVICE_FUNC inline void
|
||||
ptranspose(PacketBlock<half2,2>& kernel) {
|
||||
assert(false && "tbd");
|
||||
// half tmp = kernel.packet[0].y;
|
||||
// kernel.packet[0].y = kernel.packet[1].x;
|
||||
// kernel.packet[1].x = tmp;
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
} // end namespace internal
|
||||
|
||||
} // end namespace Eigen
|
||||
|
||||
|
||||
#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
|
@ -20,6 +20,7 @@
|
||||
using Eigen::Tensor;
|
||||
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
|
||||
void test_cuda_conversion() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
@ -53,11 +54,53 @@ void test_cuda_conversion() {
|
||||
gpu_device.deallocate(d_half);
|
||||
gpu_device.deallocate(d_conv);
|
||||
}
|
||||
|
||||
void test_cuda_elementwise() {
|
||||
Eigen::CudaStreamDevice stream;
|
||||
Eigen::GpuDevice gpu_device(&stream);
|
||||
int num_elem = 101;
|
||||
|
||||
float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
|
||||
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
|
||||
d_float1, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
|
||||
d_float2, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
|
||||
d_res_half, num_elem);
|
||||
Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
|
||||
d_res_float, num_elem);
|
||||
|
||||
gpu_float1.device(gpu_device) = gpu_float1.random();
|
||||
gpu_float2.device(gpu_device) = gpu_float2.random();
|
||||
gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
|
||||
gpu_res_half.device(gpu_device) = ((gpu_float1.cast<half>() + gpu_float2.cast<half>()) * gpu_float1.cast<half>()).cast<float>();
|
||||
|
||||
Tensor<float, 1> half_prec(num_elem);
|
||||
Tensor<float, 1> full_prec(num_elem);
|
||||
gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
|
||||
gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
|
||||
|
||||
for (int i = 0; i < num_elem; ++i) {
|
||||
VERIFY_IS_APPROX(full_prec(i), half_prec(i));
|
||||
}
|
||||
|
||||
gpu_device.deallocate(d_float1);
|
||||
gpu_device.deallocate(d_float2);
|
||||
gpu_device.deallocate(d_res_half);
|
||||
gpu_device.deallocate(d_res_float);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
void test_cxx11_tensor_of_float16_cuda()
|
||||
{
|
||||
#ifdef EIGEN_HAS_CUDA_FP16
|
||||
CALL_SUBTEST_1(test_cuda_conversion());
|
||||
CALL_SUBTEST_1(test_cuda_element_wise());
|
||||
#endif
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user