mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Revert bit_cast to use memcpy for CUDA.
To elide the memcpy, we first need to load the `src` value into registers by making a local copy. This avoids having to resort to `reinterpret_cast`, which is potential UB because it violates strict aliasing rules. This change doesn't seem to affect CPU (at least not with gcc/clang). With optimizations on, the copy is also elided.
This commit is contained in:
parent
45e67a6fda
commit
b86e013321
@ -91,20 +91,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
|
|||||||
EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
|
EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
|
||||||
THIS_TYPE_IS_NOT_SUPPORTED);
|
THIS_TYPE_IS_NOT_SUPPORTED);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
|
EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED);
|
||||||
|
|
||||||
// On GPU, the standard memcpy approach is not elided, actually producing an
|
Tgt tgt;
|
||||||
// expensive memcpy. The standard (as used by the CUDA library, and suggested
|
// Load src into registers first. This allows the memcpy to be elided by CUDA.
|
||||||
// in multiple forums) seems to be to violate strict aliasing rules.
|
const Src staged = src;
|
||||||
#if defined(EIGEN_GPU_COMPILE_PHASE)
|
EIGEN_USING_STD(memcpy)
|
||||||
return *reinterpret_cast<const Tgt*>(&src);
|
memcpy(&tgt, &staged, sizeof(Tgt));
|
||||||
#else
|
return tgt;
|
||||||
Tgt tgt;
|
|
||||||
EIGEN_USING_STD(memcpy)
|
|
||||||
memcpy(&tgt, &src, sizeof(Tgt));
|
|
||||||
return tgt;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
} // namespace numext
|
} // namespace numext
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user