Specialize std::complex operators for use on GPU device.

NVCC and older versions of clang do not fully support `std::complex` on device,
leading to either compile errors (Cannot call `__host__` function) or worse,
runtime errors (Illegal instruction).  For most functions, we can
implement specialized `numext` versions. Here we specialize the standard
operators (with the exception of stream operators and member function operators
with a scalar that are already specialized in `<complex>`) so they can be used
in device code as well.

To import these operators into the current scope, use
`EIGEN_USING_STD_COMPLEX_OPERATORS`. By default, these are imported into
the `Eigen`, `Eigen:internal`, and `Eigen::numext` namespaces.

This allow us to remove specializations of the
sum/difference/product/quotient ops, and allow us to treat complex
numbers like most other scalars (e.g. in tests).
This commit is contained in:
Antonio Sanchez 2021-01-06 09:41:15 -08:00 committed by Antonio Sánchez
parent 65e2169c45
commit f19bcffee6
3 changed files with 336 additions and 76 deletions

View File

@ -2,6 +2,7 @@
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2021 C. Antonio Sanchez <cantonios@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
@ -14,85 +15,231 @@
#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, GCC and older versions of clang do
// not treat them as device functions and thus Eigen functors making use of
// these operators fail to compile. Here, we manually specialize these
// operators and functors for complex types when building for CUDA to enable
// their use on-device.
// Import Eigen's internal operator specializations.
#define EIGEN_USING_STD_COMPLEX_OPERATORS \
using Eigen::complex_operator_detail::operator+; \
using Eigen::complex_operator_detail::operator-; \
using Eigen::complex_operator_detail::operator*; \
using Eigen::complex_operator_detail::operator/; \
using Eigen::complex_operator_detail::operator+=; \
using Eigen::complex_operator_detail::operator-=; \
using Eigen::complex_operator_detail::operator*=; \
using Eigen::complex_operator_detail::operator/=; \
using Eigen::complex_operator_detail::operator==; \
using Eigen::complex_operator_detail::operator!=;
namespace Eigen {
// Specialized std::complex overloads.
namespace complex_operator_detail {
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<T> complex_multiply(const std::complex<T>& a, const std::complex<T>& b) {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
return std::complex<T>(
a_real * b_real - a_imag * b_imag,
a_imag * b_real + a_real * b_imag);
}
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<T> complex_divide_fast(const std::complex<T>& a, const std::complex<T>& b) {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
(a_imag * b_real - a_real * b_imag) * norm);
}
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<T> complex_divide_stable(const std::complex<T>& a, const std::complex<T>& b) {
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
// Guard against over/under-flow.
const T scale = T(1) / (numext::abs(b_real) + numext::abs(b_imag));
const T a_real_scaled = numext::real(a) * scale;
const T a_imag_scaled = numext::imag(a) * scale;
const T b_real_scaled = b_real * scale;
const T b_imag_scaled = b_imag * scale;
const T b_norm2_scaled = b_real_scaled * b_real_scaled + b_imag_scaled * b_imag_scaled;
return std::complex<T>(
(a_real_scaled * b_real_scaled + a_imag_scaled * b_imag_scaled) / b_norm2_scaled,
(a_imag_scaled * b_real_scaled - a_real_scaled * b_imag_scaled) / b_norm2_scaled);
}
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<T> complex_divide(const std::complex<T>& a, const std::complex<T>& b) {
#if EIGEN_FAST_MATH
return complex_divide_fast(a, b);
#else
return complex_divide_stable(a, b);
#endif
}
// NOTE: We cannot specialize compound assignment operators with Scalar T,
// (i.e. operator@=(const T&), for @=+,-,*,/)
// since they are already specialized for float/double/long double within
// the standard <complex> header. We also do not specialize the stream
// operators.
#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T) \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator+(const std::complex<T>& a) { return a; } \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator-(const std::complex<T>& a) { \
return std::complex<T>(-numext::real(a), -numext::imag(a)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator+(const std::complex<T>& a, const std::complex<T>& b) { \
return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator+(const std::complex<T>& a, const T& b) { \
return std::complex<T>(numext::real(a) + b, numext::imag(a)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator+(const T& a, const std::complex<T>& b) { \
return std::complex<T>(a + numext::real(b), numext::imag(b)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator-(const std::complex<T>& a, const std::complex<T>& b) { \
return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator-(const std::complex<T>& a, const T& b) { \
return std::complex<T>(numext::real(a) - b, numext::imag(a)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator-(const T& a, const std::complex<T>& b) { \
return std::complex<T>(a - numext::real(b), -numext::imag(b)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator*(const std::complex<T>& a, const std::complex<T>& b) { \
return complex_multiply(a, b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator*(const std::complex<T>& a, const T& b) { \
return std::complex<T>(numext::real(a) * b, numext::imag(a) * b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator*(const T& a, const std::complex<T>& b) { \
return std::complex<T>(a * numext::real(b), a * numext::imag(b)); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator/(const std::complex<T>& a, const std::complex<T>& b) { \
return complex_divide(a, b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator/(const std::complex<T>& a, const T& b) { \
return std::complex<T>(numext::real(a) / b, numext::imag(a) / b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T> operator/(const T& a, const std::complex<T>& b) { \
return complex_divide(std::complex<T>(a, 0), b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) { \
numext::real_ref(a) += numext::real(b); \
numext::imag_ref(a) += numext::imag(b); \
return a; \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) { \
numext::real_ref(a) -= numext::real(b); \
numext::imag_ref(a) -= numext::imag(b); \
return a; \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) { \
a = complex_multiply(a, b); \
return a; \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) { \
a = complex_divide(a, b); \
return a; \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
bool operator==(const std::complex<T>& a, const std::complex<T>& b) { \
return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
bool operator==(const std::complex<T>& a, const T& b) { \
return numext::real(a) == b && numext::imag(a) == 0; \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
bool operator==(const T& a, const std::complex<T>& b) { \
return a == numext::real(b) && 0 == numext::imag(b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
bool operator!=(const std::complex<T>& a, const std::complex<T>& b) { \
return !(a == b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
bool operator!=(const std::complex<T>& a, const T& b) { \
return !(a == b); \
} \
\
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
bool operator!=(const T& a, const std::complex<T>& b) { \
return !(a == b); \
}
// Do not specialize for long double, since that reduces to double on device.
EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double)
#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
} // namespace complex_operator_detail
EIGEN_USING_STD_COMPLEX_OPERATORS
namespace numext {
EIGEN_USING_STD_COMPLEX_OPERATORS
} // namespace numext
namespace internal {
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
// functions and thus Eigen functors making use of these operators fail to
// compile. Here, we manually specialize these functors for complex types when
// building for CUDA to avoid non-constexpr methods.
// Sum
template<typename T> struct scalar_sum_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
return std::complex<T>(numext::real(a) + numext::real(b),
numext::imag(a) + numext::imag(b));
}
};
template<typename T> struct scalar_sum_op<std::complex<T>, std::complex<T> > : scalar_sum_op<const std::complex<T>, const std::complex<T> > {};
// Difference
template<typename T> struct scalar_difference_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
return std::complex<T>(numext::real(a) - numext::real(b),
numext::imag(a) - numext::imag(b));
}
};
template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T> > : scalar_difference_op<const std::complex<T>, const std::complex<T> > {};
// Product
template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
Vectorizable = packet_traits<std::complex<T> >::HasMul
};
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
return std::complex<T>(a_real * b_real - a_imag * b_imag,
a_real * b_imag + a_imag * b_real);
}
};
template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> > : scalar_product_op<const std::complex<T>, const std::complex<T> > {};
// Quotient
template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
enum {
Vectorizable = packet_traits<std::complex<T> >::HasDiv
};
typedef typename std::complex<T> result_type;
EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator() (const std::complex<T>& a, const std::complex<T>& b) const {
const T a_real = numext::real(a);
const T a_imag = numext::imag(a);
const T b_real = numext::real(b);
const T b_imag = numext::imag(b);
const T norm = T(1) / (b_real * b_real + b_imag * b_imag);
return std::complex<T>((a_real * b_real + a_imag * b_imag) * norm,
(a_imag * b_real - a_real * b_imag) * norm);
}
};
template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
EIGEN_USING_STD_COMPLEX_OPERATORS
} // namespace internal
} // namespace Eigen

View File

@ -106,6 +106,116 @@ struct complex_sqrt {
}
};
template<typename T>
struct complex_operators {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
{
using namespace Eigen;
typedef typename T::Scalar ComplexType;
typedef typename T::Scalar::value_type ValueType;
const int num_scalar_operators = 24;
const int num_vector_operators = 23; // no unary + operator.
int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);
// Scalar operators.
const ComplexType a = in[i];
const ComplexType b = in[i + 1];
out[out_idx++] = +a;
out[out_idx++] = -a;
out[out_idx++] = a + b;
out[out_idx++] = a + numext::real(b);
out[out_idx++] = numext::real(a) + b;
out[out_idx++] = a - b;
out[out_idx++] = a - numext::real(b);
out[out_idx++] = numext::real(a) - b;
out[out_idx++] = a * b;
out[out_idx++] = a * numext::real(b);
out[out_idx++] = numext::real(a) * b;
out[out_idx++] = a / b;
out[out_idx++] = a / numext::real(b);
out[out_idx++] = numext::real(a) / b;
out[out_idx] = a; out[out_idx++] += b;
out[out_idx] = a; out[out_idx++] -= b;
out[out_idx] = a; out[out_idx++] *= b;
out[out_idx] = a; out[out_idx++] /= b;
const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
out[out_idx++] = (a == b ? true_value : false_value);
out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
out[out_idx++] = (a != b ? true_value : false_value);
out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
out[out_idx++] = (numext::real(a) != b ? true_value : false_value);
// Vector versions.
T x1(in + i);
T x2(in + i + 1);
const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
const int size = T::MaxSizeAtCompileTime;
int block_idx = 0;
Map<VectorX<ComplexType>> res(out + out_idx, res_size);
res.segment(block_idx, size) = -x1;
block_idx += size;
res.segment(block_idx, size) = x1 + x2;
block_idx += size;
res.segment(block_idx, size) = x1 + x2.real();
block_idx += size;
res.segment(block_idx, size) = x1.real() + x2;
block_idx += size;
res.segment(block_idx, size) = x1 - x2;
block_idx += size;
res.segment(block_idx, size) = x1 - x2.real();
block_idx += size;
res.segment(block_idx, size) = x1.real() - x2;
block_idx += size;
res.segment(block_idx, size) = x1.array() * x2.array();
block_idx += size;
res.segment(block_idx, size) = x1.array() * x2.real().array();
block_idx += size;
res.segment(block_idx, size) = x1.real().array() * x2.array();
block_idx += size;
res.segment(block_idx, size) = x1.array() / x2.array();
block_idx += size;
res.segment(block_idx, size) = x1.array() / x2.real().array();
block_idx += size;
res.segment(block_idx, size) = x1.real().array() / x2.array();
block_idx += size;
res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
block_idx += size;
res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
block_idx += size;
res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
block_idx += size;
res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
block_idx += size;
// Equality comparisons currently not functional on device
// (std::equal_to<T> is host-only).
// const T true_vector = T::Constant(true_value);
// const T false_vector = T::Constant(false_value);
// res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
// block_idx += size;
// res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
// block_idx += size;
// res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
// block_idx += size;
// res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
// block_idx += size;
// res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
// block_idx += size;
// res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
// block_idx += size;
}
};
template<typename T>
struct replicate {
EIGEN_DEVICE_FUNC
@ -297,6 +407,8 @@ EIGEN_DECLARE_TEST(gpu_basic)
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
// Test std::complex.
CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
#if defined(__NVCC__)

View File

@ -117,6 +117,7 @@ struct compile_time_device_info {
void operator()(int i, const int* /*in*/, int* info) const
{
if (i == 0) {
EIGEN_UNUSED_VARIABLE(info)
#if defined(__CUDA_ARCH__)
info[0] = int(__CUDA_ARCH__ +0);
#endif