From 048c4d6efd34ae26cebf5a6b662d4480dfe61f0e Mon Sep 17 00:00:00 2001 From: Benoit Steiner Date: Fri, 11 Mar 2016 17:21:42 -0800 Subject: [PATCH] Made half floats usable on hardware that doesn't support them natively. --- Eigen/Core | 2 + Eigen/src/Core/arch/CUDA/Half.h | 337 ++++++++++++++++++ Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 46 --- .../test/cxx11_tensor_cast_float16_cuda.cu | 16 +- 4 files changed, 343 insertions(+), 58 deletions(-) create mode 100644 Eigen/src/Core/arch/CUDA/Half.h diff --git a/Eigen/Core b/Eigen/Core index 7107f83d0..8428c51e4 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -331,6 +331,8 @@ using std::ptrdiff_t; #include "src/Core/arch/NEON/Complex.h" #endif +#include "src/Core/arch/CUDA/Half.h" + #if defined EIGEN_VECTORIZE_CUDA #include "src/Core/arch/CUDA/PacketMath.h" #include "src/Core/arch/CUDA/PacketMathHalf.h" diff --git a/Eigen/src/Core/arch/CUDA/Half.h b/Eigen/src/Core/arch/CUDA/Half.h new file mode 100644 index 000000000..419790c3e --- /dev/null +++ b/Eigen/src/Core/arch/CUDA/Half.h @@ -0,0 +1,337 @@ +// Standard 16-bit float type, mostly useful for GPUs. Defines a new +// class Eigen::half (inheriting from CUDA's __half struct) with +// operator overloads such that it behaves basically as an arithmetic +// type. It will be quite slow on CPUs (so it is recommended to stay +// in fp32 for CPUs, except for simple parameter conversions, I/O +// to disk and the likes), but fast on GPUs. +// +// +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef EIGEN_HALF_CUDA_H +#define EIGEN_HALF_CUDA_H + +#if !defined(EIGEN_HAS_CUDA_FP16) + +// Make our own __half definition that is similar to CUDA's. +struct __half { + uint16_t x; +}; + +#endif + +namespace Eigen { + +namespace internal { + +static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(uint16_t x); +static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); +static inline EIGEN_DEVICE_FUNC float half_to_float(__half h); + +} // end namespace internal + +// Class definition. +struct half : public __half { + EIGEN_DEVICE_FUNC half() : __half(internal::raw_uint16_to_half(0)) {} + + // TODO(sesse): Should these conversions be marked as explicit? 
+  EIGEN_DEVICE_FUNC half(float f) : __half(internal::float_to_half_rtne(f)) {}
+  EIGEN_DEVICE_FUNC half(int i) : __half(internal::float_to_half_rtne(i)) {}
+  EIGEN_DEVICE_FUNC half(double d) : __half(internal::float_to_half_rtne(d)) {}
+  EIGEN_DEVICE_FUNC half(bool b)
+      : __half(internal::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  EIGEN_DEVICE_FUNC half(const __half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half(const half& h) : __half(h) {}
+  EIGEN_DEVICE_FUNC half(const volatile half& h)
+      : __half(internal::raw_uint16_to_half(h.x)) {}
+
+  EIGEN_DEVICE_FUNC explicit operator float() const {
+    return internal::half_to_float(*this);
+  }
+  EIGEN_DEVICE_FUNC explicit operator double() const {
+    return internal::half_to_float(*this);
+  }
+
+  EIGEN_DEVICE_FUNC half& operator=(const half& other) {
+    x = other.x;
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC half& operator=(const volatile half& other) {
+    x = other.x;
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC volatile half& operator=(const half& other) volatile {
+    x = other.x;
+    return *this;
+  }
+};
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increase), but you do save the
+// conversion steps back and forth.
+
+__device__ half operator + (const half& a, const half& b) {
+  return __hadd(a, b);
+}
+__device__ half operator * (const half& a, const half& b) {
+  return __hmul(a, b);
+}
+__device__ half operator - (const half& a, const half& b) {
+  return __hsub(a, b);
+}
+__device__ half operator / (const half& a, const half& b) {
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+}
+__device__ half operator - (const half& a) {
+  return __hneg(a);
+}
+__device__ half& operator += (half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+__device__ half& operator *= (half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+__device__ half& operator -= (half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+__device__ half& operator /= (half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+__device__ bool operator == (const half& a, const half& b) {
+  return __heq(a, b);
+}
+__device__ bool operator != (const half& a, const half& b) {
+  return __hne(a, b);
+}
+__device__ bool operator < (const half& a, const half& b) {
+  return __hlt(a, b);
+}
+__device__ bool operator > (const half& a, const half& b) {
+  return __hgt(a, b);
+}
+
+#else // Not CUDA 530
+
+// Definitions for CPUs and older CUDA, mostly working through conversion
+// to/from fp32.
+ +static inline EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { + return half(float(a) + float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator * (const half& a, const half& b) { + return half(float(a) * float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator - (const half& a, const half& b) { + return half(float(a) - float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator / (const half& a, const half& b) { + return half(float(a) / float(b)); +} +static inline EIGEN_DEVICE_FUNC half operator - (const half& a) { + half result; + result.x = a.x ^ 0x8000; + return result; +} +static inline EIGEN_DEVICE_FUNC half& operator += (half& a, const half& b) { + a = half(float(a) + float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC half& operator *= (half& a, const half& b) { + a = half(float(a) * float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC half& operator -= (half& a, const half& b) { + a = half(float(a) - float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) { + a = half(float(a) / float(b)); + return a; +} +static inline EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { + return float(a) == float(b); +} +static inline EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { + return float(a) != float(b); +} +static inline EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { + return float(a) < float(b); +} +static inline EIGEN_DEVICE_FUNC bool operator > (const half& a, const half& b) { + return float(a) > float(b); +} + +#endif // Not CUDA 530 + +// Conversion routines, including fallbacks for the host or older CUDA. +// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of +// these in hardware. If we need more performance on older/other CPUs, they are +// also possible to vectorize directly. + +namespace internal { + +static inline EIGEN_DEVICE_FUNC __half raw_uint16_to_half(uint16_t x) { + __half h; + h.x = x; + return h; +} + +union FP32 { + uint32_t u; + float f; +}; + +static inline EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { +#if defined(__CUDA_ARCH__) && defined(EIGEN_HAS_CUDA_FP16) + return __float2half(ff); +#else + FP32 f; f.f = ff; + + const FP32 f32infty = { 255 << 23 }; + const FP32 f16max = { (127 + 16) << 23 }; + const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + uint32_t sign_mask = 0x80000000u; + __half o = { 0 }; + + uint32_t sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + o.x = f.u - denorm_magic.u; + } else { + uint32_t mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + f.u += ((uint32_t)(15 - 127) << 23) + 0xfff; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! 
+      o.x = f.u >> 13;
+    }
+  }
+
+  o.x |= sign >> 16;
+  return o;
+#endif
+}
+
+static inline EIGEN_DEVICE_FUNC float half_to_float(__half h) {
+#if defined(__CUDA_ARCH__) && defined(EIGEN_HAS_CUDA_FP16)
+  return __half2float(h);
+#else
+  const FP32 magic = { 113 << 23 };
+  const uint32_t shifted_exp = 0x7c00 << 13;  // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.x & 0x7fff) << 13;        // exponent/mantissa bits
+  uint32_t exp = shifted_exp & o.u;  // just the exponent
+  o.u += (127 - 15) << 23;           // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {   // Inf/NaN?
+    o.u += (128 - 16) << 23;  // extra exp adjust
+  } else if (exp == 0) {      // Zero/Denormal?
+    o.u += 1 << 23;           // extra exp adjust
+    o.f -= magic.f;           // renormalize
+  }
+
+  o.u |= (h.x & 0x8000) << 16;  // sign bit
+  return o.f;
+#endif
+}
+
+} // end namespace internal
+
+// Traits.
+
+namespace internal {
+
+template<> struct is_arithmetic<half> { enum { value = true }; };
+
+} // end namespace internal
+
+// Infinity/NaN checks.
+
+namespace numext {
+
+static inline EIGEN_DEVICE_FUNC bool (isinf)(const Eigen::half& a) {
+  return (a.x & 0x7fff) == 0x7c00;
+}
+static inline EIGEN_DEVICE_FUNC bool (isnan)(const Eigen::half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(a);
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+
+} // end namespace numext
+
+} // end namespace Eigen
+
+// Standard mathematical functions and transcendentals.
+
+namespace std {
+
+static inline EIGEN_DEVICE_FUNC Eigen::half abs(const Eigen::half& a) {
+  Eigen::half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+}
+static inline EIGEN_DEVICE_FUNC Eigen::half exp(const Eigen::half& a) {
+  return Eigen::half(expf(float(a)));
+}
+static inline EIGEN_DEVICE_FUNC Eigen::half log(const Eigen::half& a) {
+  return Eigen::half(logf(float(a)));
+}
+
+} // end namespace std
+
+#endif // EIGEN_HALF_CUDA_H
diff --git a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
index 9e3c51d49..9e1d87062 100644
--- a/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
+++ b/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -19,55 +19,9 @@
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
 
-// The following operations require arch >= 5.3
-#if __CUDA_ARCH__ >= 530
-__device__ half operator + (const half& a, const half& b) {
-  return __hadd(a, b);
-}
-__device__ half operator * (const half& a, const half& b) {
-  return __hmul(a, b);
-}
-__device__ half operator - (const half& a, const half& b) {
-  return __hsub(a, b);
-}
-__device__ half operator / (const half& a, const half& b) {
-  float num = __half2float(a);
-  float denom = __half2float(b);
-  return __float2half(num / denom);
-}
-__device__ half operator - (const half& a) {
-  return __hneg(a);
-}
-__device__ half& operator += (half& a, const half& b) {
-  a = a + b;
-  return a;
-}
-__device__ half& operator *= (half& a, const half& b) {
-  a = a * b;
-  return a;
-}
-__device__ half& operator -= (half& a, const half& b) {
-  a = a - b;
-  return a;
-}
-__device__ half& operator /= (half& a, const half& b) {
-  a = a / b;
-  return a;
-}
-
-namespace std {
-__device__ half abs(const half& a) {
-  half result;
-  result.x = a.x & 0x7FFF;
-  return result;
-}
-}
-#endif
-
 namespace Eigen {
 
 namespace internal {
 
-template<> struct is_arithmetic<half> { enum { value = true }; };
 template<> struct is_arithmetic<half2> { enum { value = true }; };
 
 template<> struct packet_traits<half> : default_packet_traits
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index 7936a9126..d9ed4c855 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -19,30 +19,28 @@
 
 using Eigen::Tensor;
 
-#ifdef EIGEN_HAS_CUDA_FP16
-
 void test_cuda_conversion() {
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
   int num_elem = 101;
-  
+
   Tensor<float, 1> floats(num_elem);
   floats.setRandom();
 
   float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
-  half* d_half = (half*)gpu_device.allocate(num_elem * sizeof(half));
+  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
   float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
       d_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<half, 1>, Eigen::Aligned> gpu_half(
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
       d_half, num_elem);
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
       d_conv, num_elem);
 
   gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
 
-  gpu_half.device(gpu_device) = gpu_float.cast<half>();
+  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
   gpu_conv.device(gpu_device) = gpu_half.cast<float>();
 
   Tensor<float, 1> initial(num_elem);
@@ -60,14 +58,8 @@ void test_cuda_conversion() {
   gpu_device.deallocate(d_conv);
 }
 
-#endif
-
 void test_cxx11_tensor_cast_float16_cuda()
 {
-#ifdef EIGEN_HAS_CUDA_FP16
   CALL_SUBTEST(test_cuda_conversion());
-#else
-  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
-#endif
 }
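
Usage sketch (illustrative only, not part of the patch): the snippet below shows how the new Eigen::half type is expected to behave on a host without native fp16 support, assuming only that Eigen/Core is on the include path and the translation unit is built without CUDA, so all arithmetic goes through the fp32 fallback operators and the float_to_half_rtne/half_to_float routines added in Half.h above. The file and values are hypothetical.

#include <Eigen/Core>
#include <cstdio>

int main() {
  // Construction from float uses round-to-nearest-even conversion.
  Eigen::half a(1.5f);
  Eigen::half b(0.25f);

  // On the CPU each operator converts to float, computes, and converts back.
  Eigen::half c = a * b + Eigen::half(2.0f);

  // Comparison and explicit conversion back to float.
  if (c > b) {
    // All values here are exactly representable in fp16: prints 2.375000.
    std::printf("c = %f\n", float(c));
  }
  return 0;
}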