Revert bit_cast to use memcpy for CUDA.

To elide the memcpy, we need to first load the `src` value into
registers by making a local copy. This avoids the need to resort
to potential UB by using `reinterpret_cast`.

This change doesn't seem to affect CPU builds (at least not with gcc/clang):
with optimizations on, the extra local copy is elided there as well.
This commit is contained in:
Antonio Sanchez 2021-10-21 08:11:02 -07:00
parent 45e67a6fda
commit b86e013321

View File

@@ -91,20 +91,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value, EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
THIS_TYPE_IS_NOT_SUPPORTED); THIS_TYPE_IS_NOT_SUPPORTED);
#endif #endif
EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED); EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
// On GPU, the standard memcpy approach is not elided, actually producing an Tgt tgt;
// expensive memcpy. The standard (as used by the CUDA library, and suggested // Load src into registers first. This allows the memcpy to be elided by CUDA.
// in multiple forums) seems to be to violate strict aliasing rules. const Src staged = src;
#if defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_USING_STD(memcpy)
return *reinterpret_cast<const Tgt*>(&src); memcpy(&tgt, &staged, sizeof(Tgt));
#else return tgt;
Tgt tgt;
EIGEN_USING_STD(memcpy)
memcpy(&tgt, &src, sizeof(Tgt));
return tgt;
#endif
} }
} // namespace numext } // namespace numext