Allow vectorized padding on GPU. This helps speed things up a little

Before:
BM_padding/10            5000000        460   217.03 MFlops/s
BM_padding/80            5000000        460 13899.40 MFlops/s
BM_padding/640           5000000        461 888421.17 MFlops/s
BM_padding/4K            5000000        460 54316322.55 MFlops/s
After:
BM_padding/10            5000000        454   220.20 MFlops/s
BM_padding/80            5000000        455 14039.86 MFlops/s
BM_padding/640           5000000        452 904968.83 MFlops/s
BM_padding/4K            5000000        411 60750049.21 MFlops/s
This commit is contained in:
Benoit Steiner 2016-05-17 09:17:26 -07:00
parent 86da77cb9b
commit 5fa27574dd

View File

@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
enum {
IsAligned = false,
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,