mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Clean up CUDA/NVCC version macros and their use in Eigen, and a few other CUDA build failures.
This commit is contained in:
parent
5614400581
commit
b08527b0c1
@ -138,6 +138,7 @@ template <typename Derived>
|
||||
struct coeff_visitor
|
||||
{
|
||||
// default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
|
||||
EIGEN_DEVICE_FUNC
|
||||
coeff_visitor() : row(-1), col(-1), res(0) {}
|
||||
typedef typename Derived::Scalar Scalar;
|
||||
Index row, col;
|
||||
|
@ -60,7 +60,7 @@ struct __half_raw {
|
||||
// Nothing to do here
|
||||
// HIP fp16 header file has a definition for __half_raw
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
|
||||
typedef __half __half_raw;
|
||||
#endif // defined(EIGEN_HAS_CUDA_FP16)
|
||||
@ -83,7 +83,7 @@ struct half_base : public __half_raw {
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000)
|
||||
#if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000)
|
||||
EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
|
||||
#endif
|
||||
#endif
|
||||
@ -103,11 +103,12 @@ struct half : public half_impl::half_base {
|
||||
// Nothing to do here
|
||||
// HIP fp16 header file has a definition for __half_raw
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
// Note that EIGEN_CUDACC_VER is set to 0 even when compiling with HIP, so (EIGEN_CUDACC_VER < 90000) is true even for HIP!
|
||||
// So keeping this within #if defined(EIGEN_HAS_CUDA_FP16) is needed
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
|
||||
// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within
|
||||
// #if defined(EIGEN_HAS_CUDA_FP16) is needed
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
typedef half_impl::__half_raw __half_raw;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
EIGEN_DEVICE_FUNC half() {}
|
||||
@ -119,7 +120,7 @@ struct half : public half_impl::half_base {
|
||||
#if defined(EIGEN_HAS_HIP_FP16)
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#elif defined(EIGEN_HAS_CUDA_FP16)
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
|
||||
#endif
|
||||
#endif
|
||||
@ -238,9 +239,9 @@ namespace Eigen {
|
||||
|
||||
namespace half_impl {
|
||||
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) || \
|
||||
(defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
|
||||
EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
|
||||
#define EIGEN_HAS_NATIVE_FP16
|
||||
#endif
|
||||
|
||||
@ -251,7 +252,7 @@ namespace half_impl {
|
||||
|
||||
#if defined(EIGEN_HAS_NATIVE_FP16)
|
||||
EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
return __hadd(::__half(a), ::__half(b));
|
||||
#else
|
||||
return __hadd(a, b);
|
||||
@ -264,7 +265,7 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
|
||||
return __hsub(a, b);
|
||||
}
|
||||
EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
|
||||
#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
|
||||
#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
|
||||
return __hdiv(a, b);
|
||||
#else
|
||||
float num = __half2float(a);
|
||||
@ -312,13 +313,13 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(EIGEN_HAS_NATIVE_FP16) || defined(__clang__) // Emulate support for half floats
|
||||
#if !defined(EIGEN_HAS_NATIVE_FP16) || EIGEN_COMP_CLANG // Emulate support for half floats
|
||||
|
||||
#if defined(__clang__) && defined(__CUDA__)
|
||||
#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
|
||||
// We need to provide emulated *host-side* FP16 operators for clang.
|
||||
#pragma push_macro("EIGEN_DEVICE_FUNC")
|
||||
#undef EIGEN_DEVICE_FUNC
|
||||
#if defined(EIGEN_HAS_GPU_FP16)
|
||||
#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
|
||||
#define EIGEN_DEVICE_FUNC __host__
|
||||
#else // both host and device need emulated ops.
|
||||
#define EIGEN_DEVICE_FUNC __host__ __device__
|
||||
@ -517,7 +518,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
|
||||
return result;
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hexp(a));
|
||||
#else
|
||||
@ -528,7 +529,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
|
||||
return half(numext::expm1(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
(defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
|
||||
return half(::hlog(a));
|
||||
#else
|
||||
@ -542,7 +543,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
|
||||
return half(::log10f(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hsqrt(a));
|
||||
#else
|
||||
@ -565,7 +566,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
|
||||
return half(::tanhf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hfloor(a));
|
||||
#else
|
||||
@ -573,7 +574,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
|
||||
#endif
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return half(hceil(a));
|
||||
#else
|
||||
@ -673,7 +674,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
|
||||
return Eigen::half(::expf(float(a)));
|
||||
}
|
||||
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
return Eigen::half(::hlog(a));
|
||||
#else
|
||||
@ -712,7 +713,7 @@ struct hash<Eigen::half> {
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
|
||||
#if (EIGEN_CUDACC_VER < 90000) || \
|
||||
#if (EIGEN_CUDA_SDK_VER < 90000) || \
|
||||
defined(EIGEN_HAS_HIP_FP16)
|
||||
return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
|
||||
#else
|
||||
|
@ -102,6 +102,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const do
|
||||
return make_double2(from, from);
|
||||
}
|
||||
|
||||
#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG)
|
||||
namespace {
|
||||
|
||||
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
|
||||
@ -213,6 +214,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
|
||||
pcmp_eq<double2>(const double2& a, const double2& b) {
|
||||
return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
|
||||
}
|
||||
#endif // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
|
||||
return make_float4(a, a+1, a+2, a+3);
|
||||
|
@ -73,8 +73,13 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen:
|
||||
}
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
|
||||
#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
to[0] = from.x;
|
||||
to[1] = from.y;
|
||||
#else
|
||||
to[0] = __low2half(from);
|
||||
to[1] = __high2half(from);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
@ -477,7 +482,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1<half2>(const half2
|
||||
return __floats2half2_rn(r1, r2);
|
||||
}
|
||||
|
||||
#if (EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
|
||||
defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
|
||||
template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
|
||||
|
@ -380,7 +380,7 @@
|
||||
#if defined EIGEN_CUDACC
|
||||
#define EIGEN_VECTORIZE_GPU
|
||||
#include <vector_types.h>
|
||||
#if EIGEN_CUDACC_VER >= 70500
|
||||
#if EIGEN_CUDA_SDK_VER >= 70500
|
||||
#define EIGEN_HAS_CUDA_FP16
|
||||
#endif
|
||||
#endif
|
||||
|
@ -108,6 +108,18 @@
|
||||
#define EIGEN_COMP_MSVC 0
|
||||
#endif
|
||||
|
||||
#if defined(__NVCC__)
|
||||
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
||||
#define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
||||
#elif defined(__CUDACC_VER__)
|
||||
#define EIGEN_COMP_NVCC __CUDACC_VER__
|
||||
#else
|
||||
#error "NVCC did not define compiler version."
|
||||
#endif
|
||||
#else
|
||||
#define EIGEN_COMP_NVCC 0
|
||||
#endif
|
||||
|
||||
// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
|
||||
// name ver MSC_VER
|
||||
// 2008 9 1500
|
||||
@ -408,10 +420,11 @@
|
||||
#define EIGEN_CUDA_ARCH __CUDA_ARCH__
|
||||
#endif
|
||||
|
||||
#if defined(CUDA_VERSION)
|
||||
#define EIGEN_CUDACC_VER (CUDA_VERSION*10)
|
||||
#if defined(EIGEN_CUDACC)
|
||||
#include <cuda.h>
|
||||
#define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
|
||||
#else
|
||||
#define EIGEN_CUDACC_VER 0
|
||||
#define EIGEN_CUDA_SDK_VER 0
|
||||
#endif
|
||||
|
||||
#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
|
||||
@ -622,7 +635,7 @@
|
||||
// Does the compiler support variadic templates?
|
||||
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES
|
||||
#if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
|
||||
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
|
||||
&& (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) )
|
||||
// ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
|
||||
// this prevents nvcc from crashing when compiling Eigen on Tegra X1
|
||||
#define EIGEN_HAS_VARIADIC_TEMPLATES 1
|
||||
@ -637,7 +650,7 @@
|
||||
#ifndef EIGEN_HAS_CONSTEXPR
|
||||
#if defined(EIGEN_CUDACC)
|
||||
// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
|
||||
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
|
||||
#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500))
|
||||
#define EIGEN_HAS_CONSTEXPR 1
|
||||
#endif
|
||||
#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
|
||||
@ -981,7 +994,7 @@ namespace Eigen {
|
||||
#endif
|
||||
|
||||
|
||||
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
|
||||
#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC)
|
||||
// for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
|
||||
#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
|
||||
using Base::operator =;
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
#ifndef EIGEN_TEST_GPU_COMMON_H
|
||||
#define EIGEN_TEST_GPU_COMMON_H
|
||||
|
||||
@ -130,10 +129,14 @@ void ei_test_init_gpu()
|
||||
std::cout << " EIGEN_CUDACC: " << int(EIGEN_CUDACC) << "\n";
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_CUDACC_VER
|
||||
std::cout << " EIGEN_CUDACC_VER: " << int(EIGEN_CUDACC_VER) << "\n";
|
||||
#ifdef EIGEN_CUDA_SDK_VER
|
||||
std::cout << " EIGEN_CUDA_SDK_VER: " << int(EIGEN_CUDA_SDK_VER) << "\n";
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_COMP_NVCC
|
||||
std::cout << " EIGEN_COMP_NVCC: " << int(EIGEN_COMP_NVCC) << "\n";
|
||||
#endif
|
||||
|
||||
#ifdef EIGEN_HIPCC
|
||||
std::cout << " EIGEN_HIPCC: " << int(EIGEN_HIPCC) << "\n";
|
||||
#endif
|
||||
|
18
test/main.h
18
test/main.h
@ -52,15 +52,17 @@
|
||||
#endif
|
||||
|
||||
// Same for cuda_fp16.h
|
||||
#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
|
||||
#define EIGEN_TEST_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
|
||||
#elif defined(__CUDACC_VER__)
|
||||
#define EIGEN_TEST_CUDACC_VER __CUDACC_VER__
|
||||
#else
|
||||
#define EIGEN_TEST_CUDACC_VER 0
|
||||
#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
|
||||
// Means the compiler is either nvcc or clang with CUDA enabled
|
||||
#define EIGEN_CUDACC __CUDACC__
|
||||
#endif
|
||||
|
||||
#if EIGEN_TEST_CUDACC_VER >= 70500
|
||||
#if defined(EIGEN_CUDACC)
|
||||
#include <cuda.h>
|
||||
#define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
|
||||
#else
|
||||
#define EIGEN_CUDA_SDK_VER 0
|
||||
#endif
|
||||
#if EIGEN_CUDA_SDK_VER >= 70500
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
|
@ -388,7 +388,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
|
||||
// the sum across all big k blocks of the product of little k block of index (x, y)
|
||||
// with block of index (y, z). To compute the final output, we need to reduce
|
||||
// the 8 threads over y by summation.
|
||||
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
||||
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
|
||||
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
|
||||
#else
|
||||
#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
|
||||
@ -621,7 +621,7 @@ EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rh
|
||||
x1 = rhs_pf0.x;
|
||||
x2 = rhs_pf0.z;
|
||||
}
|
||||
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
|
||||
#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
|
||||
x1 = __shfl_xor(x1, 4);
|
||||
x2 = __shfl_xor(x2, 4);
|
||||
#else
|
||||
|
@ -81,8 +81,8 @@
|
||||
// gpu_assert can be overridden
|
||||
#ifndef gpu_assert
|
||||
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDACC) && (EIGEN_CUDACC_VER==0))
|
||||
// clang-cuda and HIPCC do not support the use of assert on the GPU side.
|
||||
#if defined(EIGEN_HIP_DEVICE_COMPILE)
|
||||
// HIPCC do not support the use of assert on the GPU side.
|
||||
#define gpu_assert(COND)
|
||||
#else
|
||||
#define gpu_assert(COND) assert(COND)
|
||||
|
@ -177,7 +177,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
|
||||
} else {
|
||||
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
|
||||
}
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
|
||||
#else
|
||||
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
|
||||
@ -269,7 +269,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
wka_in.h = accum;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &accum);
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
|
||||
#else
|
||||
int temp = __shfl_down_sync(0xFFFFFFFF, *(int*)(&accum), (unsigned)offset, warpSize);
|
||||
@ -466,7 +466,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
|
||||
} else {
|
||||
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
|
||||
}
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
|
||||
#else
|
||||
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
|
||||
@ -571,7 +571,7 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
|
||||
wka_in.h = reduced_val2;
|
||||
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
|
||||
reducer.reducePacket(wka_out.h, &reduced_val2);
|
||||
#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
|
||||
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
|
||||
reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
|
||||
reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
|
||||
#else
|
||||
|
Loading…
Reference in New Issue
Block a user