mirror of
https://gitlab.com/libeigen/eigen.git
synced 2024-12-21 07:19:46 +08:00
Speed up trivial tensor broadcasting on GPU by enforcing unaligned loads. See PR 437.
This commit is contained in:
parent
723856dec1
commit
679eece876
@ -284,7 +284,13 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
|||||||
|
|
||||||
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
|
||||||
if (isCopy) {
|
if (isCopy) {
|
||||||
|
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||||
|
// See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speedup by enforcing
|
||||||
|
// unaligned loads here. The reason is unclear though.
|
||||||
|
return m_impl.template packet<Unaligned>(index);
|
||||||
|
#else
|
||||||
return m_impl.template packet<LoadMode>(index);
|
return m_impl.template packet<LoadMode>(index);
|
||||||
|
#endif
|
||||||
} else if (oneByN && !nByOne) {
|
} else if (oneByN && !nByOne) {
|
||||||
return packetNByOne<LoadMode>(index);
|
return packetNByOne<LoadMode>(index);
|
||||||
} else if (!oneByN && nByOne) {
|
} else if (!oneByN && nByOne) {
|
||||||
@ -296,7 +302,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (isCopy) {
|
if (isCopy) {
|
||||||
|
#ifdef EIGEN_GPU_COMPILE_PHASE
|
||||||
|
// See above.
|
||||||
|
return m_impl.template packet<Unaligned>(index);
|
||||||
|
#else
|
||||||
return m_impl.template packet<LoadMode>(index);
|
return m_impl.template packet<LoadMode>(index);
|
||||||
|
#endif
|
||||||
} else if (oneByN && !nByOne) {
|
} else if (oneByN && !nByOne) {
|
||||||
return packetOneByN<LoadMode>(index);
|
return packetOneByN<LoadMode>(index);
|
||||||
} else if (!oneByN && nByOne) {
|
} else if (!oneByN && nByOne) {
|
||||||
|
Loading…
Reference in New Issue
Block a user