Avoid using the CUDA memcpy for small tensor slices, since the memcpy kernel is very expensive to launch.

This commit is contained in:
Benoit Steiner 2015-05-19 15:19:01 -07:00
parent a81d17b73a
commit 2451679951

View File

@ -283,6 +283,26 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
};
// FIXME: figure out the exact threshold above which memcpy is faster than the regular evaluation.
namespace {

// Predicate that decides whether a slice should be copied with the device's
// memcpy rather than through the regular evaluation path. The slicing
// evaluator calls it with the number of contiguous values in the slice.
//
// Generic version: memcpy is used once that count is strictly greater than
// twice the number of threads reported by the device.
template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
  // Captures the threshold (2 * device.numThreads()) at construction time.
  // Explicit, so that a device is never implicitly converted into a trigger.
  EIGEN_DEVICE_FUNC explicit MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }

  // Returns true when val is strictly greater than the stored threshold.
  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }

 private:
  Index threshold_;
};

// It is very expensive to start the memcpy kernel on GPU: we therefore only
// use it for large copies.
#ifdef EIGEN_USE_GPU
template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice> {
  // The device argument is ignored: the GPU threshold is a fixed constant.
  // Explicit for the same reason as the generic version.
  EIGEN_DEVICE_FUNC explicit MemcpyTriggerForSlicing(const GpuDevice&) { }

  // Returns true when val is strictly greater than 4*1024*1024.
  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
};
#endif

}
// Eval as rvalue
template<typename StartIndices, typename Sizes, typename ArgType, typename Device>
struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
@ -364,7 +384,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
}
}
// Use memcpy if it's going to be faster than using the regular evaluation.
if (contiguous_values > static_cast<Index>(2 * m_device.numThreads())) {
const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
if (trigger(contiguous_values)) {
Scalar* src = m_impl.data();
for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
Index offset = srcCoeff(i);